• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1 /*
2  * This file is part of the KDE libraries
3  * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
4  * Copyright (C) 2003,2004 Apple Computer, Inc.
5  * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  *
21  */
22 
23 #include "regexp.h"
24 
25 #include "lexer.h"
26 #include <assert.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 using namespace KJS;
32 
#ifdef PCRE_CONFIG_UTF8
// Cached answer to "was libpcre built with UTF-8 support?". Starts out as
// Unknown; the RegExp constructor replaces it with Supported or Unsupported
// the first time it runs.
RegExp::UTF8SupportState RegExp::utf8Support(RegExp::Unknown);
#endif
36 
37 RegExp::RegExp(const UString &p, int f)
38  : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
39 {
40  // Determine whether libpcre has unicode support if need be..
41 #ifdef PCRE_CONFIG_UTF8
42  if (utf8Support == Unknown) {
43  int supported;
44  pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
45  utf8Support = supported ? Supported : Unsupported;
46  }
47 #endif
48 
49  nrSubPatterns = 0; // determined in match() with POSIX regex.
50 
51  // JS regexps can contain Unicode escape sequences (\uxxxx) which
52  // are rather uncommon elsewhere. As our regexp libs don't understand
53  // them we do the unescaping ourselves internally.
54  // Also make sure to expand out any nulls as pcre_compile
55  // expects null termination..
56  UString intern;
57  const char* const nil = "\\x00";
58  if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
59  bool escape = false;
60  for (int i = 0; i < p.size(); ++i) {
61  UChar c = p[i];
62  if (escape) {
63  escape = false;
64  // we only care about \u
65  if (c == 'u') {
66  // standard unicode escape sequence looks like \uxxxx but
67  // other browsers also accept less then 4 hex digits
68  unsigned short u = 0;
69  int j = 0;
70  for (j = 0; j < 4; ++j) {
71  if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72  u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
73  ++i;
74  } else {
75  // sequence incomplete. restore index.
76  // TODO: cleaner way to propagate warning
77  fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
78  i -= j;
79  break;
80  }
81  }
82  if (j < 4) {
83  // sequence was incomplete. treat \u as u which IE always
84  // and FF sometimes does.
85  intern.append(UString('u'));
86  } else {
87  c = UChar(u);
88  switch (u) {
89  case 0:
90  // Make sure to encode 0, to avoid terminating the string
91  intern += UString(nil);
92  break;
93  case '^':
94  case '$':
95  case '\\':
96  case '.':
97  case '*':
98  case '+':
99  case '?':
100  case '(': case ')':
101  case '{': case '}':
102  case '[': case ']':
103  case '|':
104  // escape pattern characters have to remain escaped
105  intern.append(UString('\\'));
106  // intentional fallthrough
107  default:
108  intern += UString(&c, 1);
109  break;
110  }
111  }
112  continue;
113  }
114  intern += UString('\\');
115  intern += UString(&c, 1);
116  } else {
117  if (c == '\\')
118  escape = true;
119  else if (c == '\0')
120  intern += UString(nil);
121  else
122  intern += UString(&c, 1);
123  }
124  }
125  } else {
126  intern = p;
127  }
128 
129 #ifdef HAVE_PCREPOSIX
130  int pcreflags = 0;
131  const char *perrormsg;
132  int errorOffset;
133 
134  if (flgs & IgnoreCase)
135  pcreflags |= PCRE_CASELESS;
136 
137  if (flgs & Multiline)
138  pcreflags |= PCRE_MULTILINE;
139 
140 #ifdef PCRE_CONFIG_UTF8
141  if (utf8Support == Supported)
142  pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
143 #endif
144 
145  // Fill our buffer with an encoded version, whether utf-8, or,
146  // if PCRE is incapable, truncated.
147  prepareMatch(intern);
148 
149  pcregex = pcre_compile(buffer, pcreflags,
150  &perrormsg, &errorOffset, NULL);
151  doneMatch(); // Cleanup buffers
152  if (!pcregex) {
153 #ifndef NDEBUG
154  fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
155 #endif
156  valid = false;
157  return;
158  }
159 
160 #ifdef PCRE_INFO_CAPTURECOUNT
161  // Get number of subpatterns that will be returned
162  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
163  if (rc != 0)
164 #endif
165  nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
166 
167 #else /* HAVE_PCREPOSIX */
168 
169  int regflags = 0;
170 #ifdef REG_EXTENDED
171  regflags |= REG_EXTENDED;
172 #endif
173 #ifdef REG_ICASE
174  if ( f & IgnoreCase )
175  regflags |= REG_ICASE;
176 #endif
177 
178  //NOTE: Multiline is not feasible with POSIX regex.
179  //if ( f & Multiline )
180  // ;
181  // Note: the Global flag is already handled by RegExpProtoFunc::execute
182 
183  int errorCode = regcomp(&preg, intern.ascii(), regflags);
184  if (errorCode != 0) {
185 #ifndef NDEBUG
186  char errorMessage[80];
187  regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
188  fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
189 #endif
190  valid = false;
191  }
192 #endif
193 }
194 
195 RegExp::~RegExp()
196 {
197  doneMatch(); // Be 100% sure buffers are freed
198 #ifdef HAVE_PCREPOSIX
199  if (pcregex)
200  pcre_free(pcregex);
201 #else
202  /* TODO: is this really okay after an error ? */
203  regfree(&preg);
204 #endif
205 }
206 
207 void RegExp::prepareUtf8(const UString& s)
208 {
209  // Allocate a buffer big enough to hold all the characters plus \0
210  const int length = s.size();
211  buffer = new char[length * 3 + 1];
212 
213  // Also create buffer for positions. We need one extra character in there,
214  // even past the \0 since the non-empty handling may jump one past the end
215  originalPos = new int[length * 3 + 2];
216 
217  // Convert to runs of 8-bit characters, and generate indeces
218  // Note that we do NOT combine surrogate pairs here, as
219  // regexps operate on them as separate characters
220  char *p = buffer;
221  int *posOut = originalPos;
222  const UChar *d = s.data();
223  for (int i = 0; i != length; ++i) {
224  unsigned short c = d[i].unicode();
225 
226  int sequenceLen;
227  if (c < 0x80) {
228  *p++ = (char)c;
229  sequenceLen = 1;
230  } else if (c < 0x800) {
231  *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
232  *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
233  sequenceLen = 2;
234  } else {
235  *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
236  *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
237  *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
238  sequenceLen = 3;
239  }
240 
241  while (sequenceLen > 0) {
242  *posOut = i;
243  ++posOut;
244  --sequenceLen;
245  }
246  }
247 
248  bufferSize = p - buffer;
249 
250  *p++ = '\0';
251 
252  // Record positions for \0, and the fictional character after that.
253  *posOut = length;
254  *(posOut+1) = length+1;
255 }
256 
257 void RegExp::prepareASCII (const UString& s)
258 {
259  originalPos = 0;
260 
261  // Best-effort attempt to get something done
262  // when we don't have utf 8 available -- use
263  // truncated version, and pray for the best
264  CString truncated = s.cstring();
265  buffer = new char[truncated.size() + 1];
266  memcpy(buffer, truncated.c_str(), truncated.size());
267  buffer[truncated.size()] = '\0'; // For _compile use
268  bufferSize = truncated.size();
269 }
270 
271 void RegExp::prepareMatch(const UString &s)
272 {
273  delete[] originalPos; // Just to be sure..
274  delete[] buffer;
275 #ifdef PCRE_CONFIG_UTF8
276  if (utf8Support == Supported)
277  prepareUtf8(s);
278  else
279 #endif
280  prepareASCII(s);
281 
282 #ifndef NDEBUG
283  originalS = s;
284 #endif
285 }
286 
287 void RegExp::doneMatch()
288 {
289  delete[] originalPos; originalPos = 0;
290  delete[] buffer; buffer = 0;
291 }
292 
293 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
294 {
295 #ifndef NDEBUG
296  assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
297 #endif
298  assert(valid);
299 
300  if (i < 0)
301  i = 0;
302  if (ovector)
303  *ovector = 0L;
304  int dummyPos;
305  if (!pos)
306  pos = &dummyPos;
307  *pos = -1;
308  if (i > s.size() || s.isNull())
309  return UString::null;
310 
311 #ifdef HAVE_PCREPOSIX
312  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
313  if (ovector) *ovector = new int[ovecsize];
314  if (!pcregex)
315  return UString::null;
316 
317  int startPos;
318  int nextPos;
319 
320 #ifdef PCRE_CONFIG_UTF8
321  if (utf8Support == Supported) {
322  startPos = i;
323  while (originalPos[startPos] < i)
324  ++startPos;
325 
326  nextPos = startPos;
327  if (i < s.size()) {
328  while (originalPos[nextPos] < (i + 1))
329  ++nextPos;
330  }
331  } else
332 #endif
333  {
334  startPos = i;
335  nextPos = i + (i < s.size() ? 1 : 0);
336  }
337 
338  int baseFlags =
339 #ifdef PCRE_CONFIG_UTF8
340  utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
341 #endif
342  0;
343  int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
344  m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
345  ovector ? *ovector : 0L, ovecsize);
346  if (numMatches < 0)
347  {
348  // Failed to match.
349  if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
350  {
351  // We set m_notEmpty ourselves, to look for a non-empty match
352  // (see man pcretest or pcretest.c for details).
353  // So we don't stop here, we want to try again at i+1.
354 #ifdef KJS_VERBOSE
355  fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
356 #endif
357  m_notEmpty = 0;
358  numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
359  ovector ? *ovector : 0L, ovecsize);
360  if (numMatches < 0)
361  return UString::null;
362  }
363  else // done
364  return UString::null;
365  }
366 
367  // Got a match, proceed with it.
368  // But fix up the ovector if need be..
369  if (ovector && originalPos) {
370  for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
371  if ((*ovector)[c] != -1)
372  (*ovector)[c] = originalPos[(*ovector)[c]];
373  }
374  }
375 
376  if (!ovector)
377  return UString::null; // don't rely on the return value if you pass ovector==0
378 #else
379  const uint maxMatch = 10;
380  regmatch_t rmatch[maxMatch];
381 
382  char *str = strdup(s.ascii()); // TODO: why ???
383  if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
384  free(str);
385  return UString::null;
386  }
387  free(str);
388 
389  if (!ovector) {
390  *pos = rmatch[0].rm_so + i;
391  return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
392  }
393 
394  // map rmatch array to ovector used in PCRE case
395  nrSubPatterns = 0;
396  for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
397  nrSubPatterns++;
398  // if the nonEmpty flag is set, return a failed match if any of the
399  // subMatches happens to be an empty string.
400  if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
401  return UString::null;
402  }
403  // Allow an ovector slot to return the (failed) match result.
404  if (nrSubPatterns == 0) nrSubPatterns = 1;
405 
406  int ovecsize = (nrSubPatterns)*3; // see above
407  *ovector = new int[ovecsize];
408  for (uint j = 0; j < nrSubPatterns; j++) {
409  (*ovector)[2*j] = rmatch[j].rm_so + i;
410  (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
411  }
412 #endif
413 
414  *pos = (*ovector)[0];
415  if ( *pos == (*ovector)[1] && (flgs & Global) )
416  {
417  // empty match, next try will be with m_notEmpty=true
418  m_notEmpty=true;
419  }
420  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
421 }
422 
#if 0 // unused
/*
 * Reports whether the expression matches anywhere in s. The second
 * argument is ignored. Disabled: this code is not compiled.
 */
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  int ovector[300];
  CString buffer(s.cstring());

  if (s.isNull())
    return false;

  // Anything other than "no match" counts as success here.
  const int execResult = pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
                                   0, ovector, 300);
  return execResult != PCRE_ERROR_NOMATCH;

#else

  char *str = strdup(s.ascii());
  const bool matched = (regexec(&preg, str, 0, 0, 0) == 0);
  free(str);

  return matched;
#endif
}
#endif
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:868
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:81
KJS::UString::size
int size() const
Definition: ustring.h:359
KJS::UString::null
static UString null
Static instance of a null string.
Definition: ustring.h:428
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:798
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:480
KJS::CString
8 bit char based string class
Definition: ustring.h:165
KJS::UString
Unicode string class.
Definition: ustring.h:189
KJS::UChar
Unicode character.
Definition: ustring.h:51
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:343
KJS
Definition: array_instance.h:27
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:457
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars chopping of any higher bytes.
Definition: ustring.cpp:485
KJS::UString::data
const UChar * data() const
Definition: ustring.h:339

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •   kspeech
  •     interface
  •     library
  •   tdetexteditor
  • kate
  • kded
  • kdoctools
  • kimgio
  • kjs
  • libtdemid
  • libtdescreensaver
  •     tdecore
  • tdeabc
  • tdecmshell
  • tdecore
  • tdefx
  • tdehtml
  • tdeinit
  • tdeio
  •   bookmarks
  •   httpfilter
  •   kpasswdserver
  •   kssl
  • tdeioslave
  •   http
  •   tdefile
  •   tdeio
  •   tdeioexec
  • tdemdi
  •   tdemdi
  • tdenewstuff
  • tdeparts
  • tdeprint
  • tderandr
  • tderesources
  • tdespell2
  • tdesu
  • tdeui
  • tdeunittest
  • tdeutils
  • tdewallet
Generated for kjs by doxygen 1.8.8
This website is maintained by Timothy Pearson.