root/trunk/meta/regexp2.d

Revision 254, 21.9 kB (checked in by pragma, 2 years ago)

--

Line 
1 // Compile-time regular expression parser, version 2.0
2 // Author: Don Clugston, borrowing heavily from the 1.0 parser written by Eric Anderton.
3 // This version uses mixins for simplicity, flexibility, and high performance.
4
5 // Terminology:
6 // a regex "sequence" is a set of consecutive "term"s,
7 // each of which consists of a "naked term", optionally followed by
8 // a "quantifier" (*,+,?, {m}, {m,} or {m,n}).
9 // A "naked term" is either a "sequence" or an "atom".
10
11 /*
12 Features currently supported:
13    * match previous expression 0 or more times
14    + match previous expression 1 or more times
15    ? match previous expression 0 or 1 times
16    {m,n}  match previous expression between m and n times
17    {m,} match previous expression m or more times
18    {,n} match previous expression between 0 and m times
19    {n} match previous expression exactly n times.
20    . match any character
21    other characters match themselves
22    a|b   match regular expression a or b
23    uncaptured grouping via ( )
24    ^   start of line
25    [abc]  match any character in character class abc
26    [^abc] match any character not in character class abc
27    @n  match string variables passed into the functions as extra parameters. (this is a non-standard extension).
28   
29    All matches are lazy, rather than greedy.
30   
31 Planned, but not yet supported:
32    $   end of line
33    captured grouping
34    escape characters
35    word matching
36    greedy matching
37    full diagnostic error messages
38  
39 Possible:
40   an optimisation pass, including features such as searching the pattern in reverse order.
41   \1..\9 to match previously captured subsequences
42   Perl 6 syntax
43   
44 */
45
46 // Points of interest:
47 // * The parser is able to treat all 'quantifier's in a single mixin function, while still applying
48 //   optimisations (eg, there's absolutely no difference between {1,} and "+").
49 // * There is absolutely no parameter passing inside the regexp engine. Even functions which
50 //   can't be inlined will have very low calling cost.
51 // * Consequently, the speed is excellent. The main unnecessary operations are the checks to see whether we
52 //   are at the end of the string.
53 //   This could be greatly improved by precalculating the minimum length required for a match,
54 //   at least for subsequences of fixed length.
55 // * Since each mixin can be given access to any desired runtime or compile-time parameters,
56 //   the scheme is extremely flexible.
57
58 module meta.regexp2;
59
60 //---------------------------------------------------------------------
61 // Part 0 : Functions from the meta library
62 //---------------------------------------------------------------------
63
64 /******************************************************
65  *  ulong atoui!(char [] s);
66  *
67  *  Converts an ASCII string to an uint.
68  */
69 template atoui(char [] s, uint result=0, int indx=0)
70 {
71     static if (s.length==indx)
72         const uint atoui = result;
73     else static if (s[indx]<'0' || s[indx]>'9')
74         const uint atoui = result;
75     else
76         const uint atoui = atoui!(s, result * 10 + s[indx]-'0', indx+1);
77 }
78
79 //---------------------------------------------------------------------
80 // Part I : Functions for parsing a regular expression string literal.
81 //---------------------------------------------------------------------
82 // None of these generate any code.
83
84 // retuns index of first char in regstr which equals ch, or -1 if not found
85 // escaped chars are ignored
86 template unescapedFindFirst(char [] regstr, char ch, int indx=0)
87 {
88     static if (regstr.length<=indx)
89         const int unescapedFindFirst = -1; // not found
90     else static if (regstr[indx]==ch) const int unescapedFindFirst=indx;
91     else static if (regstr[indx]=='\\')
92         // if it's escaped, prevent it from matching.
93         const int unescapedunescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+2);
94     else const int unescapedFindFirst = unescapedFindFirst!(regstr, ch, indx+1);
95 }
96
97 // Returns the number of chars at the start of regstr which are made up by
98 // a repetition expression (+, *, ?, {,} )
99 template quantifierConsumed(char [] regstr)
100 {
101     static if (regstr.length==0) const int quantifierConsumed = 0;
102     else static if (regstr[0]=='+' || regstr[0]=='*' || regstr[0]=='?') const int quantifierConsumed = 1;
103     else static if (regstr[0]=='{') {
104         static if (unescapedFindFirst!(regstr, '}')==-1) {
105             pragma(msg, "Error: unmatched { in regular expression");
106             static assert(0);
107         } else const int quantifierConsumed = 1 + unescapedFindFirst!(regstr, '}');
108     } else const int quantifierConsumed = 0;
109 }
110
111 // The minimum allowable number of repetitions for this quantifier
112 template quantifierMin(char [] regstr)
113 {
114     static if (regstr[0]=='*' || regstr[0]=='?') const uint quantifierMin = 0;
115     else static if (regstr[0]=='+') const uint quantifierMin = 1;
116     else {
117         static assert (regstr[0]=='{') ;
118         const uint quantifierMin = atoui!(regstr[1..$]);
119      }
120 }
121
122 // The maximum allowable number of repetitions for this quantifier
123 template quantifierMax(char [] regstr)
124 {
125     static if (regstr[0]=='*' || regstr[0]=='+') const uint quantifierMax = uint.max;
126     else static if (regstr[0]=='?') const uint quantifierMax = 1;
127     else static if (regstr[0]=='{') {
128         static if (unescapedFindFirst!(regstr, ',')==-1) // "{n}"
129             const uint quantifierMax = quantifierMin!(regstr);
130         else static if (regstr[$-2]==',') // "{n,}"
131             const uint quantifierMax = uint.max;
132         else // "{n,m}"
133             const uint quantifierMax = atoui!(regstr[ 1+unescapedFindFirst!(regstr, ',') .. $]);
134     } else {
135         pragma(msg, "Error: unsupported quantifier " ~ regstr);
136         static assert(0);
137     }
138 }
139
140 // find the index of the first |, or -1 if not found.
141 // ignores escaped items, and anything in parentheses.
142 template findUnion(char [] regstr, int indx=0, int numopenparens=0)
143 {
144     static if (indx>=regstr.length)
145         const int findUnion = -1;
146     else static if (numopenparens==0 && regstr[indx]=='|')
147         const int findUnion = indx;
148     else static if (regstr[indx]==')')
149         const int findUnion = findUnion!(regstr, indx+1, numopenparens-1);
150     else static if (regstr[indx]=='(')
151         const int findUnion = findUnion!(regstr, indx+1, numopenparens+1);
152     else static if (regstr[indx]=='\\') // skip the escaped character
153         const int findUnion = findUnion!(regstr, indx+2, numopenparens);
154     else
155         const int findUnion = findUnion!(regstr, indx+1, numopenparens);
156 }
157
158 // keeps going until the number of ) parens equals the number of ( parens.
159 // All escaped characters are ignored.
160 // BUG: what about inside [-] ?
161 template parenConsumed(char [] regstr, int numopenparens=0)
162 {
163     static if (regstr.length==0) {
164         pragma(msg, "Unmatched parenthesis");
165         static assert(0);
166     } else static if (regstr[0]==')') {
167         static if (numopenparens==1) const int parenConsumed=1; // finished!
168         else const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens-1);
169     } else static if (regstr[0]=='(') {
170         const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens+1);
171     } else static if (regstr[0]=='\\' && regstr.length>1)
172         // ignore \(, \).
173         const int parenConsumed = 2 + parenConsumed!(regstr[2..$], numopenparens);
174     else
175         const int parenConsumed = 1 + parenConsumed!(regstr[1..$], numopenparens);   
176 }
177
178 // the naked term, with no quantifier. Either an atom, or a subsequence.
179 template atomConsumed(char [] regstr)
180 {
181     static if (regstr[0]=='(') const int atomConsumed = parenConsumed!(regstr);
182     else static if (regstr[0]=='[') const int atomConsumed = 1 + unescapedFindFirst!(regstr, ']');
183     else static if (regstr[0]=='\\') { // escape character
184         static if (regstr.length>0) {
185             const int atomConsumed=2;
186         } else  {
187             pragma(msg, "Error: Regexp must not end with \\");
188             static assert(0);
189         }
190     } else static if (regstr[0]=='@')  { // NONSTANDARD: referenced parameter
191         const int atomConsumed=2;
192     } else const int atomConsumed=1; // match single char
193 }
194
195 // parses a term from the front of regstr (which must not be empty).
196 // consisting of an atom, optionally followed by a quantifier.
197 template termConsumed(char [] regstr)
198 {
199     const int termConsumed = atomConsumed!(regstr) + quantifierConsumed!(regstr[atomConsumed!(regstr)..$]);
200 }
201
202 //---------------------------------------------------------------------
203 // Part II: mixins which generate the final code
204 //---------------------------------------------------------------------
205 // Unlike most regexp engines, which turn the pattern string into a table-based state machine,
206 // this one generates a binary tree of nested functions. Each node in the tree corresponds to
207 // a D template, and is generated as a mixin.
208
209 // At compile time, each mixin is passed a subset of a regexp string.
210 // It generates a member function bool fn(), which updates a pointer p,
211 // and returns true if a match was found.
212
213 // Each mixin has access to the following values:
214 // At compile time:
215 //     fullpattern -- the complete, unparsed regular expression string
216 // At run time:
217 //     searchstr (read only) -- the string being searched
218 //     p --- the first character in searchstr which is not yet matched.
219 //     param[0..8] -- the quasi-static parameter strings @1..@9 to match.
220
221 // Additional variables or constants can be added as desired.
222
223 // Most of the complexity in the regexp engine comes from the optional quantifiers.
224 // In general, they can only determine how far to match by testing if the entire remainder
225 // of the pattern can be matched.
226 //
227 // Each mixin also recieves a template alias 'next'. This has a member bool fn() which
228 // returns true if the remainder of the regexp match is successful.
229 // All regexps must ensure that next.fn is called.
230
231 // Note that unless p is reset to 0, it will automatically behave as a global search,
232 // continuing from the last place it left off.
233
234 template parseRegexp(char [] pattern)
235 {
236     mixin alwaysTrue!() endSequence;
237     mixin regSequence!(pattern, endSequence) allseq;
238     alias allseq.fn fn;
239 }
240
241 template alwaysTrue() // used to mark the end of a sequence
242 {
243    bool fn () { return true; }
244 }
245
246 // regstr is a sequence of productions, possibly containing a union
247 template regSequence(char [] regstr, alias next)
248 {
249     static if (findUnion!(regstr)==-1) {
250         // No unions to worry about
251         mixin regNoUnions!(regstr, next);
252     } else {
253         bool fn() {
254             // Both halves of the union have the same next, inherited from the parent
255             mixin regSequence!(regstr[0..findUnion!(regstr)], next) a;
256             mixin regSequence!(regstr[findUnion!(regstr)+1..$], next) b;
257             int oldp = p;
258             if (a.fn()) return true;
259             p = oldp;
260             return b.fn();
261         }
262     }
263 }
264
265 // regstr is a sequence of terms, all of which must be matched
266 // Does not contain any unions
267 template regNoUnions(char [] regstr, alias next)
268 {
269     static if (regstr.length == termConsumed!(regstr)) {
270         // there's only a single item (possibly including a quantifier)
271         mixin regTerm!(regstr, next);
272     } else {
273         bool fn() {           
274             mixin regSequence!(regstr[termConsumed!(regstr)..$], next) second;
275             mixin regTerm!(regstr[0..termConsumed!(regstr)], second) first;
276             return first.fn();
277         }
278     }
279 }
280
281 // the term without a quantifier. Here we deal with embedded subsequences.
282 template regSingleTerm(char [] regstr, alias next)
283 {
284         static if (regstr[0]=='(') {
285             // A sequence always calls next.
286             mixin regSequence!(regstr[1..$-1], next);
287         } else  {
288             // A simple atom doesn't call next, so we need to do it here.
289             bool fn() {
290                 mixin regAtom!(regstr) a;
291                 return a.fn() && next.fn();
292             }
293         }
294 }
295
296 // Evaluate one term (without quantifier).
297 // This helper class has two purposes:
298 // (1) to restore the 'p' pointer when we return.
299 // (2) ensure that at least one character was consumed
300 template regSequenceDontUpdateP(char [] regstr)
301 {
302     bool fn() {
303         mixin regSequence!(regstr, endSequence) x;
304         // It's only a successful match if _something_ was consumed
305         if (p==theinitialp) return false;       
306         int oldp = p;
307         if (!x.fn()) return false;
308         p = oldp;
309         return true;
310     }
311 }
312
313 //  Calls the naked term twice, but only updates 'p' after the first one.
314 // Evaluate the term, knowing that what comes after will be the same as this.
315 template regTermTwice(char [] regstr)
316 {
317     static if (regstr[0]=='(') {
318         bool fn()
319         {
320             // While evaluating this first sequence, if this is a sequence
321             // that potentially has zero length (ie, everything is a *, ? or {m,} term),
322             // each term should attempt to consume at least one character if possible.
323             int theinitialp = p;
324             mixin regSequenceDontUpdateP!(regstr[1..t-1]) suddendeath;
325             mixin regSequence!(regstr[1..t-1], suddendeath) a;
326             return a.fn();
327        }
328     } else {
329             bool fn() {
330                 // It's easy with atoms, because we know they always eat something.
331                 // BUG: Maybe this will fail when null @n strings are passed in?               
332                 mixin regAtom!(regstr) a;
333                 return a.fn();
334             }   
335     }
336 }
337
338 // the atom, optionally followed by a quantifier.
339 // Here we deal with all kinds of repitition,
340 // but we make different optimisations depending on the allowable repeats.
341 template regTerm(char [] regstr, alias next)
342 {
343     static if (atomConsumed!(regstr)==regstr.length) {
344             // there is no quantifier, just use the naked term
345        mixin regSingleTerm!(regstr, next);
346     } else {
347         bool fn() {
348             const int t = atomConsumed!(regstr);
349             const uint repmin = quantifierMin!(regstr[t..$]);
350             const uint repmax = quantifierMax!(regstr[t..$]);
351            
352             // HORRENDOUSLY inefficient! In some cases, we generate the quantified term THREE TIMES!
353             // The first one contains the rest of the search tree.
354             // This is used when we think we can do (atom).(next) for an early exit
355             mixin regTerm!(regstr[0..t], next) atomAndNext;
356
357             debug writefln(fullpattern, " Quantifier ",regstr ,  " starting at ", searchstr[p..$]);
358            
359             static if (repmin == 0 && repmax == 1) {
360                 // "?", or "{0,1}". Worth optimising seperately
361                 int oldp = p;
362                 if (next.fn()) { return true; }
363                 p = oldp;
364                 return atomAndNext.fn();
365             } else {
366                 // Here's where we generate the redundant term.
367                 // If we can't do (atom).(next), we must be able to do
368                 // (atom).(atom) to stay in the game.                       
369                   mixin regTermTwice!(regstr[0..t]) atomonly;
370
371                 static if (repmin==0 && repmax == uint.max) {
372                     // optimise for "*", "{0,}"
373                     int oldp=p;
374                     if (next.fn()) return true; // We can finish right now.
375                     p=oldp;
376                     do {
377                       // Can we do (atom).(next) ?
378                       oldp = p;
379                       if (atomAndNext.fn()) { return true; }
380                       p = oldp;
381                       // We need to do (atom).(atom) to have any chance of continuing.                 
382                       // also, it must have consumed at least one character, or there is no hope.                 
383                     } while (atomonly.fn() && p != oldp);
384                     return false;
385                 } else { // "+", or "{m,n}"
386                     int numreps=0; // how many repeats have we found?
387                     int oldp;
388                     do {
389                         oldp = p;
390                         numreps++;
391                         if (numreps>=repmin && atomAndNext.fn()) return true;
392                         p = oldp;
393                         static if (repmax<uint.max) {
394                             // optimise for "+", "{n,}"
395                             if (numreps == repmax) return false;
396                         }
397                      } while (atomonly.fn() && p!=oldp);
398                     return false;
399                }
400             }
401         }
402     }
403 }
404
405 // generate a parser for an atom
406 // IN: regstr is a valid atom, without a repeat
407 // OUT: if atom is matched, return true, and update p.
408 //      if atom is not matched, return false, and leave p unchanged.
409 template regAtom(char [] regstr)
410 {
411     static if (regstr[0]=='[') {
412         static if (regstr[1]=='^')
413         {
414             bool fn() { // inverse character class           
415                 return (p<searchstr.length && !charMatches!(regstr[2..$-1])(searchstr[p++]));
416             }
417         } else {
418             bool fn() { // character class
419                 return (p<searchstr.length && charMatches!(regstr[1..$-1])(searchstr[p++]));
420             }
421         }           
422     } else static if (regstr[0]=='.') { // match any
423         bool fn() {
424             if (p==searchstr.length) return false;
425             p++;
426             return true;
427         }
428     } else static if (regstr[0]=='@')  { // NONSTANDARD: referenced parameter
429         mixin regParameter!(atoui!(regstr[1..$])-1);
430     } else static if (regstr[0]=='^') { // start of line
431         bool fn() {
432             return (p==0 || searchstr[p-1]=='\n');
433         }   
434     } else {
435         // match single character
436         bool fn() {
437             if (p==searchstr.length || searchstr[p]!=regstr[0]) return false;
438             p++;
439             return true;
440         }
441     }
442 }
443
444 // match a variable string, which will be passed as a parameter.
445 template regParameter(int parmnum)
446 {
447     bool fn() {
448         if (p + param[parmnum].length > searchstr.length) return false;
449         if (searchstr[p..p+param[parmnum].length] != param[parmnum]) return false;
450         p+=param[parmnum].length;
451         return true;
452     }
453 }
454
455 //"a-zA-Z0-9_"
456
457 // return true if char ch is matched by the character class regstr.
458 template charMatches(char [] regstr)
459 {
460     bool charMatches(char ch) {
461         static if (regstr.length==0) return false;
462         else static if (regstr.length>=3 && regstr[1]=='-') {
463             return (ch>=regstr[0] && ch<=regstr[2]) || charMatches!(regstr[3..$])(ch);
464         } else return (ch==regstr[0]) || charMatches!(regstr[1..$])(ch);
465     }
466 }
467
468 //---------------------------------------------------------------------
469 // Part III: the public interface of the regexp engine
470 //---------------------------------------------------------------------
471
472 // Does the regexp match the pattern?
473 template test(char [] fullpattern)
474 {
475     bool test(char [] searchstr, char [][] param...) {
476         int p = 0; // start at the beginning of the string
477         mixin parseRegexp!(fullpattern) engine;
478         return engine.fn();
479     }
480 }
481
482 /// Return first substring which matches the pattern.
483 /// Note that some patterns will return an empty string as a valid result.
484 template search(char [] fullpattern)
485 {
486     char [] search(char [] searchstr, char [][] param...) {
487         int p; // next index to test
488         mixin parseRegexp!(fullpattern) engine;
489         for (int x=0; x<searchstr.length; ++x) {
490             p=x;
491             if (engine.fn()) return searchstr[x..p];
492         }
493         return "<No match>"; // no match
494     }
495 }
496
497 //---------------------------------------------------------------------
498 //                         EXAMPLE
499 //---------------------------------------------------------------------
500
501 import std.stdio;
502
503 void main()
504 {
505 writefln("BEGINNING UNIT TESTS\n");
506 assert(search!("ab")("aaab")=="ab"); 
507 assert(search!("a*b")("aaab")=="aaab"); 
508 assert(search!("a*(b)")("aaab")=="aaab"); 
509 assert(search!("((a*b))")("aaab")=="aaab"); 
510 assert(search!("(a*)b")("aaab")=="aaab"); 
511 assert(search!("(b*a*)*b")("aaab")=="aaab"); 
512 assert(search!("b+cd")("acdbbcabbcdaaab")=="bbcd");
513 assert(search!("b?cd")("abcacbacdb")=="cd");
514 assert(search!("(ab)?abc")("aababcab")=="ababc");
515 assert(search!("(ab)*abc")("aababcab")=="ababc");
516 assert(search!("((a)*|xyz)b")("aaab")=="aaab"); 
517 assert(search!("(ab)*(abb)")("bababb")=="ababb"); 
518 assert(search!("e?(ab)*b+")("eaaababbbbaac")=="ababb");
519 assert(search!("(ab*)*c")("bbbababbaaabaaaabbbbc") == "ababbaaabaaaabbbbc");
520 char [] quasistatic="m";
521 assert(search!("(@1.*@1)")("they said D can't do metaprogramming?", quasistatic)=="metaprogram");
522 assert(search!("[h-za]*g")("metaprogramming")=="taprog");
523 assert(search!("(a*)*b")("cacaaab")=="aaab");
524 assert(search!("(a*b*)*c")("dababdaabababbaaabbbcab")=="aabababbaaabbbc");
525 assert(search!("((a*b*)|da)*b")("fasdaaab")=="daaab");
526
527 char [] qq;
528 writefln("=========");
529
530 qq = search!("((a*b*)|da)*b")("fasdaaab");
531 writefln("Result: ----",qq, "---");
532 writefln("=========");
533
534 }
535
536 //-------------------------------------------------------------------------------
537 /+
538
539 // NOT CURRENTLY USED
540
541 // Finds the number of instances of 'ch' in str which aren't preceded by a backslash
542 // ch must not be a backslash.
543 template unescapedCount(char [] str, char ch)
544 {
545     static if (str.length==0) const int unescapedCount = 0;
546     else static if (str[0]=='\\' && str.length>1) const int unescapedCount = unescapedCount!(str[2..$], ch);
547     else static if (str[0]==ch) const int unescapedCount = 1 + unescapedCount!(str[1..$], ch);
548     else const int unescapedCount = unescapedCount!(str[1..$], ch);
549 }
550
551 +/
552
553 //-------------
554 // unit tests
555 //-------------
556 version (testmeta) {
557     static assert(quantifierConsumed!("{456}345")==5);
558     static assert(parenConsumed!("(45(6)4)5")==8);
559     static assert(parenConsumed!(`(45\(6)45`)==7);
560 }
Note: See TracBrowser for help on using the browser.