| 36 | | a string controlling the interpretation |
|---|
| 37 | | of the regular expression. |
|---|
| 38 | | It consists of a sequence of one or more |
|---|
| 39 | | of the following characters: |
|---|
| 40 | | |
|---|
| 41 | | <table border=1 cellspacing=0 cellpadding=5> |
|---|
| 42 | | <caption>Attribute Characters</caption> |
|---|
| 43 | | $(TR $(TH Attribute) $(TH Action)) |
|---|
| 44 | | <tr> |
|---|
| 45 | | $(TD $(B g)) |
|---|
| 46 | | $(TD global; repeat over the whole input string) |
|---|
| 47 | | </tr> |
|---|
| 48 | | <tr> |
|---|
| 49 | | $(TD $(B i)) |
|---|
| 50 | | $(TD case insensitive) |
|---|
| 51 | | </tr> |
|---|
| 52 | | <tr> |
|---|
| 53 | | $(TD $(B m)) |
|---|
| 54 | | $(TD treat as multiple lines separated by newlines) |
|---|
| 55 | | </tr> |
|---|
| 56 | | </table> |
|---|
| | 36 | a string controlling the interpretation |
|---|
| | 37 | of the regular expression. |
|---|
| | 38 | It consists of a sequence of one or more |
|---|
| | 39 | of the following characters: |
|---|
| | 40 | |
|---|
| | 41 | <table border=1 cellspacing=0 cellpadding=5> |
|---|
| | 42 | <caption>Attribute Characters</caption> |
|---|
| | 43 | $(TR $(TH Attribute) $(TH Action)) |
|---|
| | 44 | <tr> |
|---|
| | 45 | $(TD $(B g)) |
|---|
| | 46 | $(TD global; repeat over the whole input string) |
|---|
| | 47 | </tr> |
|---|
| | 48 | <tr> |
|---|
| | 49 | $(TD $(B i)) |
|---|
| | 50 | $(TD case insensitive) |
|---|
| | 51 | </tr> |
|---|
| | 52 | <tr> |
|---|
| | 53 | $(TD $(B m)) |
|---|
| | 54 | $(TD treat as multiple lines separated by newlines) |
|---|
| | 55 | </tr> |
|---|
| | 56 | </table> |
|---|
| 60 | | * <table border=1 cellspacing=0 cellpadding=5> |
|---|
| 61 | | <caption>Formatting Characters</caption> |
|---|
| 62 | | $(TR $(TH Format) $(TH Replaced With)) |
|---|
| 63 | | $(TR |
|---|
| 64 | | $(TD $(B $$)) $(TD $) |
|---|
| 65 | | ) |
|---|
| 66 | | $(TR |
|---|
| 67 | | $(TD $(B $&)) $(TD The matched substring.) |
|---|
| 68 | | ) |
|---|
| 69 | | $(TR |
|---|
| 70 | | $(TD $(B $`)) $(TD The portion of the string that precedes the matched substring.) |
|---|
| 71 | | ) |
|---|
| 72 | | $(TR |
|---|
| 73 | | $(TD $(B $')) $(TD The portion of the string that follows the matched substring.) |
|---|
| 74 | | ) |
|---|
| 75 | | $(TR |
|---|
| 76 | | $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n) |
|---|
| 77 | | is a single digit 1-9 |
|---|
| 78 | | and $$(I n) is not followed by a decimal digit.) |
|---|
| 79 | | ) |
|---|
| 80 | | $(TR |
|---|
| 81 | | $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn) |
|---|
| 82 | | is a two-digit decimal |
|---|
| 83 | | number 01-99. |
|---|
| 84 | | If the $(I nn)th capture is undefined or $(I nn) is more than the number |
|---|
| 85 | | of parenthesized subexpressions, use the empty |
|---|
| 86 | | string instead.) |
|---|
| 87 | | ) |
|---|
| 88 | | </table> |
|---|
| 89 | | |
|---|
| 90 | | * Any other $ are left as is. |
|---|
| | 60 | * <table border=1 cellspacing=0 cellpadding=5> |
|---|
| | 61 | <caption>Formatting Characters</caption> |
|---|
| | 62 | $(TR $(TH Format) $(TH Replaced With)) |
|---|
| | 63 | $(TR |
|---|
| | 64 | $(TD $(B $$)) $(TD $) |
|---|
| | 65 | ) |
|---|
| | 66 | $(TR |
|---|
| | 67 | $(TD $(B $&)) $(TD The matched substring.) |
|---|
| | 68 | ) |
|---|
| | 69 | $(TR |
|---|
| | 70 | $(TD $(B $`)) $(TD The portion of the string that precedes the matched substring.) |
|---|
| | 71 | ) |
|---|
| | 72 | $(TR |
|---|
| | 73 | $(TD $(B $')) $(TD The portion of the string that follows the matched substring.) |
|---|
| | 74 | ) |
|---|
| | 75 | $(TR |
|---|
| | 76 | $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n) |
|---|
| | 77 | is a single digit 1-9 |
|---|
| | 78 | and $$(I n) is not followed by a decimal digit.) |
|---|
| | 79 | ) |
|---|
| | 80 | $(TR |
|---|
| | 81 | $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn) |
|---|
| | 82 | is a two-digit decimal |
|---|
| | 83 | number 01-99. |
|---|
| | 84 | If the $(I nn)th capture is undefined or $(I nn) is more than the number |
|---|
| | 85 | of parenthesized subexpressions, use the empty |
|---|
| | 86 | string instead.) |
|---|
| | 87 | ) |
|---|
| | 88 | </table> |
|---|
| | 89 | |
|---|
| | 90 | * Any other $ are left as is. |
|---|
| 100 | | Escape sequences: |
|---|
| 101 | | |
|---|
| 102 | | \nnn starts out a 1, 2 or 3 digit octal sequence, |
|---|
| 103 | | where n is an octal digit. If nnn is larger than |
|---|
| 104 | | 0377, then the 3rd digit is not part of the sequence |
|---|
| 105 | | and is not consumed. |
|---|
| 106 | | For maximal portability, use exactly 3 digits. |
|---|
| 107 | | |
|---|
| 108 | | \xXX starts out a 1 or 2 digit hex sequence. X |
|---|
| 109 | | is a hex character. If the first character after the \x |
|---|
| 110 | | is not a hex character, the value of the sequence is 'x' |
|---|
| 111 | | and the XX are not consumed. |
|---|
| 112 | | For maximal portability, use exactly 2 digits. |
|---|
| 113 | | |
|---|
| 114 | | \uUUUU is a unicode sequence. There are exactly |
|---|
| 115 | | 4 hex characters after the \u; if any are not, then |
|---|
| 116 | | the value of the sequence is 'u', and the UUUU are not |
|---|
| 117 | | consumed. |
|---|
| 118 | | |
|---|
| 119 | | Character classes: |
|---|
| 120 | | |
|---|
| 121 | | [a-b], where a is greater than b, will produce |
|---|
| 122 | | an error. |
|---|
| 123 | | |
|---|
| 124 | | References: |
|---|
| 125 | | |
|---|
| 126 | | http://www.unicode.org/unicode/reports/tr18/ |
|---|
| | 100 | Escape sequences: |
|---|
| | 101 | |
|---|
| | 102 | \nnn starts out a 1, 2 or 3 digit octal sequence, |
|---|
| | 103 | where n is an octal digit. If nnn is larger than |
|---|
| | 104 | 0377, then the 3rd digit is not part of the sequence |
|---|
| | 105 | and is not consumed. |
|---|
| | 106 | For maximal portability, use exactly 3 digits. |
|---|
| | 107 | |
|---|
| | 108 | \xXX starts out a 1 or 2 digit hex sequence. X |
|---|
| | 109 | is a hex character. If the first character after the \x |
|---|
| | 110 | is not a hex character, the value of the sequence is 'x' |
|---|
| | 111 | and the XX are not consumed. |
|---|
| | 112 | For maximal portability, use exactly 2 digits. |
|---|
| | 113 | |
|---|
| | 114 | \uUUUU is a unicode sequence. There are exactly |
|---|
| | 115 | 4 hex characters after the \u; if any are not, then |
|---|
| | 116 | the value of the sequence is 'u', and the UUUU are not |
|---|
| | 117 | consumed. |
|---|
| | 118 | |
|---|
| | 119 | Character classes: |
|---|
| | 120 | |
|---|
| | 121 | [a-b], where a is greater than b, will produce |
|---|
| | 122 | an error. |
|---|
| | 123 | |
|---|
| | 124 | References: |
|---|
| | 125 | |
|---|
| | 126 | http://www.unicode.org/unicode/reports/tr18/ |
|---|
| 250 | | int so = r.pmatch[0].rm_so; |
|---|
| 251 | | int eo = r.pmatch[0].rm_eo; |
|---|
| 252 | | |
|---|
| 253 | | rchar[] replacement = dg(r); |
|---|
| 254 | | |
|---|
| 255 | | // Optimize by using std.string.replace if possible - Dave Fladebo |
|---|
| 256 | | rchar[] slice = result[offset + so .. offset + eo]; |
|---|
| 257 | | if (r.attributes & RegExp.REA.global && // global, so replace all |
|---|
| 258 | | !(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case |
|---|
| 259 | | !(r.attributes & RegExp.REA.multiline) && // not multiline |
|---|
| 260 | | pattern == slice) // simple pattern (exact match, no special characters) |
|---|
| 261 | | { |
|---|
| 262 | | debug(regexp) |
|---|
| 263 | | printf("pattern: %.*s, slice: %.*s, replacement: %.*s\n",pattern,result[offset + so .. offset + eo],replacement); |
|---|
| 264 | | result = std.string.replace(result,slice,replacement); |
|---|
| 265 | | break; |
|---|
| 266 | | } |
|---|
| 267 | | |
|---|
| 268 | | result = replaceSlice(result, result[offset + so .. offset + eo], replacement); |
|---|
| 269 | | |
|---|
| 270 | | if (r.attributes & RegExp.REA.global) |
|---|
| 271 | | { |
|---|
| 272 | | offset += replacement.length - (eo - so); |
|---|
| 273 | | |
|---|
| 274 | | if (lastindex == eo) |
|---|
| 275 | | lastindex++; // always consume some source |
|---|
| 276 | | else |
|---|
| 277 | | lastindex = eo; |
|---|
| 278 | | } |
|---|
| 279 | | else |
|---|
| 280 | | break; |
|---|
| | 249 | int so = r.pmatch[0].rm_so; |
|---|
| | 250 | int eo = r.pmatch[0].rm_eo; |
|---|
| | 251 | |
|---|
| | 252 | rchar[] replacement = dg(r); |
|---|
| | 253 | |
|---|
| | 254 | // Optimize by using std.string.replace if possible - Dave Fladebo |
|---|
| | 255 | rchar[] slice = result[offset + so .. offset + eo]; |
|---|
| | 256 | if (r.attributes & RegExp.REA.global && // global, so replace all |
|---|
| | 257 | !(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case |
|---|
| | 258 | !(r.attributes & RegExp.REA.multiline) && // not multiline |
|---|
| | 259 | pattern == slice) // simple pattern (exact match, no special characters) |
|---|
| | 260 | { |
|---|
| | 261 | debug(regexp) |
|---|
| | 262 | printf("pattern: %.*s, slice: %.*s, replacement: %.*s\n",pattern,result[offset + so .. offset + eo],replacement); |
|---|
| | 263 | result = std.string.replace(result,slice,replacement); |
|---|
| | 264 | break; |
|---|
| | 265 | } |
|---|
| | 266 | |
|---|
| | 267 | result = replaceSlice(result, result[offset + so .. offset + eo], replacement); |
|---|
| | 268 | |
|---|
| | 269 | if (r.attributes & RegExp.REA.global) |
|---|
| | 270 | { |
|---|
| | 271 | offset += replacement.length - (eo - so); |
|---|
| | 272 | |
|---|
| | 273 | if (lastindex == eo) |
|---|
| | 274 | lastindex++; // always consume some source |
|---|
| | 275 | else |
|---|
| | 276 | lastindex = eo; |
|---|
| | 277 | } |
|---|
| | 278 | else |
|---|
| | 279 | break; |
|---|
| 497 | | if (auto m = std.regexp.search("abcdef", "c()")) |
|---|
| 498 | | { |
|---|
| 499 | | auto result = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post); |
|---|
| 500 | | assert(result == "ab[c]def"); |
|---|
| 501 | | assert(m.match(1) == null); |
|---|
| 502 | | assert(m.match(2) == null); |
|---|
| | 496 | if (auto m = regexp.search("abcdef", "c()")) |
|---|
| | 497 | { |
|---|
| | 498 | auto result = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post); |
|---|
| | 499 | assert(result == "ab[c]def"); |
|---|
| | 500 | assert(m.match(1) == null); |
|---|
| | 501 | assert(m.match(2) == null); |
|---|
| 541 | | pmatch = (&gmatch)[0 .. 1]; |
|---|
| 542 | | compile(pattern, attributes); |
|---|
| 543 | | } |
|---|
| | 552 | input = null; |
|---|
| | 553 | pmatch = (&gmatch)[0 .. 1]; |
|---|
| | 554 | this.pattern = pattern; |
|---|
| | 555 | this.flags = attributes; |
|---|
| | 556 | |
|---|
| | 557 | compile(pattern, attributes); |
|---|
| | 558 | } |
|---|
| | 559 | |
|---|
/**
 * Construct a RegExp from an already compiled program instead of
 * compiling a pattern at run time.
 *
 * Params:
 *	rawpattern = pattern source text; stored in this.pattern
 *	rawattributes = attribute string; stored in this.flags
 *	precompiledprogram = compiled program; stored in program (not copied)
 *	precompiledattributes = attribute bits; stored in attributes
 *	precompiledCaptures = capture count; stored in re_nsub
 */
public this(rchar[] rawpattern, rchar[] rawattributes, ubyte [] precompiledprogram, uint precompiledattributes, uint precompiledCaptures)
{
    input = null;
    pmatch = (&gmatch)[0 .. 1];
    this.pattern = rawpattern;
    this.flags = rawattributes;

    attributes = precompiledattributes;
    re_nsub = precompiledCaptures;
    program = precompiledprogram;

    // compile() sizes pmatch to re_nsub + 1 entries whenever the capture
    // count grows; do the same here so a precompiled program with captures
    // does not run with the single-entry slice over gmatch.
    // NOTE(review): the matcher is not visible from here - confirm it
    // indexes pmatch[1 .. re_nsub + 1].
    if (re_nsub > 0)
    {
        pmatch = null;
        pmatch.length = re_nsub + 1;
    }
}
|---|
| | 571 | |
|---|
| 565 | | debug(regexp) printf("regexp.opCall.unittest()\n"); |
|---|
| 566 | | auto r1 = RegExp("hello", "m"); |
|---|
| 567 | | char[] msg; |
|---|
| 568 | | try |
|---|
| 569 | | { |
|---|
| 570 | | auto r2 = RegExp("hello", "q"); |
|---|
| 571 | | assert(0); |
|---|
| 572 | | } |
|---|
| 573 | | catch (RegExpException ree) |
|---|
| 574 | | { |
|---|
| 575 | | msg = ree.toString(); |
|---|
| 576 | | //writefln("message: %s", ree); |
|---|
| 577 | | } |
|---|
| 578 | | assert(msg == "unrecognized attribute"); |
|---|
| | 596 | debug(regexp) printf("regexp.opCall.unittest()\n"); |
|---|
| | 597 | auto r1 = RegExp("hello", "m"); |
|---|
| | 598 | char[] msg; |
|---|
| | 599 | try |
|---|
| | 600 | { |
|---|
| | 601 | auto r2 = RegExp("hello", "q"); |
|---|
| | 602 | assert(0); |
|---|
| | 603 | } |
|---|
| | 604 | catch (RegExpException ree) |
|---|
| | 605 | { |
|---|
| | 606 | msg = ree.toString(); |
|---|
| | 607 | //writefln("message: %s", ree); |
|---|
| | 608 | } |
|---|
| | 609 | assert(msg == "unrecognized attribute"); |
|---|
| 630 | | debug(regexp) printf("regexp.search.unittest()\n"); |
|---|
| 631 | | |
|---|
| 632 | | int i; |
|---|
| 633 | | foreach(m; RegExp("ab").search("abcabcabab")) |
|---|
| 634 | | { |
|---|
| 635 | | auto s = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post); |
|---|
| 636 | | if (i == 0) assert(s == "[ab]cabcabab"); |
|---|
| 637 | | else if (i == 1) assert(s == "abc[ab]cabab"); |
|---|
| 638 | | else if (i == 2) assert(s == "abcabc[ab]ab"); |
|---|
| 639 | | else if (i == 3) assert(s == "abcabcab[ab]"); |
|---|
| 640 | | else assert(0); |
|---|
| 641 | | i++; |
|---|
| 642 | | } |
|---|
| | 661 | debug(regexp) printf("regexp.search.unittest()\n"); |
|---|
| | 662 | |
|---|
| | 663 | int i; |
|---|
| | 664 | foreach(m; rexSearch!("ab")("abcabcabab")) |
|---|
| | 665 | { |
|---|
| | 666 | auto s = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post); |
|---|
| | 667 | if (i == 0) assert(s == "[ab]cabcabab"); |
|---|
| | 668 | else if (i == 1) assert(s == "abc[ab]cabab"); |
|---|
| | 669 | else if (i == 2) assert(s == "abcabc[ab]ab"); |
|---|
| | 670 | else if (i == 3) assert(s == "abcabcab[ab]"); |
|---|
| | 671 | else assert(0); |
|---|
| | 672 | i++; |
|---|
| | 673 | } |
|---|
| 700 | | global = 1, // has the g attribute |
|---|
| 701 | | ignoreCase = 2, // has the i attribute |
|---|
| 702 | | multiline = 4, // if treat as multiple lines separated |
|---|
| 703 | | // by newlines, or as a single line |
|---|
| 704 | | dotmatchlf = 8, // if . matches \n |
|---|
| | 728 | global = 1, // has the g attribute |
|---|
| | 729 | ignoreCase = 2, // has the i attribute |
|---|
| | 730 | multiline = 4, // if treat as multiple lines separated |
|---|
| | 731 | // by newlines, or as a single line |
|---|
| | 732 | dotmatchlf = 8, // if . matches \n |
|---|
| 727 | | REend, // end of program |
|---|
| 728 | | REchar, // single character |
|---|
| 729 | | REichar, // single character, case insensitive |
|---|
| 730 | | REdchar, // single UCS character |
|---|
| 731 | | REidchar, // single wide character, case insensitive |
|---|
| 732 | | REanychar, // any character |
|---|
| 733 | | REanystar, // ".*" |
|---|
| 734 | | REstring, // string of characters |
|---|
| 735 | | REistring, // string of characters, case insensitive |
|---|
| 736 | | REtestbit, // any in bitmap, non-consuming |
|---|
| 737 | | REbit, // any in the bit map |
|---|
| 738 | | REnotbit, // any not in the bit map |
|---|
| 739 | | RErange, // any in the string |
|---|
| 740 | | REnotrange, // any not in the string |
|---|
| 741 | | REor, // a | b |
|---|
| 742 | | REplus, // 1 or more |
|---|
| 743 | | REstar, // 0 or more |
|---|
| 744 | | REquest, // 0 or 1 |
|---|
| 745 | | REnm, // n..m |
|---|
| 746 | | REnmq, // n..m, non-greedy version |
|---|
| 747 | | REbol, // beginning of line |
|---|
| 748 | | REeol, // end of line |
|---|
| 749 | | REparen, // parenthesized subexpression |
|---|
| 750 | | REgoto, // goto offset |
|---|
| | 750 | REend, // end of program |
|---|
| | 751 | REchar, // single character |
|---|
| | 752 | REichar, // single character, case insensitive |
|---|
| | 753 | REdchar, // single UCS character |
|---|
| | 754 | REidchar, // single wide character, case insensitive |
|---|
| | 755 | REanychar, // any character |
|---|
| | 756 | REanystar, // ".*" |
|---|
| | 757 | REstring, // string of characters |
|---|
| | 758 | REistring, // string of characters, case insensitive |
|---|
| | 759 | REtestbit, // any in bitmap, non-consuming |
|---|
| | 760 | REbit, // any in the bit map |
|---|
| | 761 | REnotbit, // any not in the bit map |
|---|
| | 762 | RErange, // any in the string |
|---|
| | 763 | REnotrange, // any not in the string |
|---|
| | 764 | REor, // a | b |
|---|
| | 765 | REplus, // 1 or more |
|---|
| | 766 | REstar, // 0 or more |
|---|
| | 767 | REquest, // 0 or 1 |
|---|
| | 768 | REnm, // n..m |
|---|
| | 769 | REnmq, // n..m, non-greedy version |
|---|
| | 770 | REbol, // beginning of line |
|---|
| | 771 | REeol, // end of line |
|---|
| | 772 | REparen, // parenthesized subexpression |
|---|
| | 773 | REgoto, // goto offset |
|---|
| 772 | | public void compile(rchar[] pattern, rchar[] attributes) |
|---|
| 773 | | { |
|---|
/**
 * (Re)compile this RegExp from pattern, interpreted according to
 * patternattributes.
 *
 * Params:
 *	pattern = regular expression source text
 *	patternattributes = attribute string, parsed by compileAttributes
 *
 * Throws: RegExpException from compileAttributes when the attribute
 * string is invalid; in that case no member has been modified yet.
 */
public void compile(rchar[] pattern, rchar[] patternattributes)
{
    // Validate the attribute string before touching any state, so that a
    // bad string does not leave flags describing attributes never applied.
    uint newattributes = compileAttributes(patternattributes);

    // Forget any previous subject string and remember what is being
    // compiled, as the constructors do.
    input = null;
    this.pattern = pattern;
    this.flags = patternattributes;
    this.attributes = newattributes;

    uint oldre_nsub = re_nsub;
    uint numCaptures = 0;

    program = compilePattern(pattern, numCaptures, attributes);
    re_nsub = numCaptures;
    if (re_nsub > oldre_nsub)
    {
        // pmatch may still be the slice over gmatch; detach before growing
        if (pmatch is &gmatch)
            pmatch = null;
        pmatch.length = re_nsub + 1;
    }
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
}
|---|
| | 821 | |
|---|
/**
 * Compile-time evaluation of an attribute string into REA bits.
 *
 * Consumes patternattributes one character at a time, accumulating the
 * bits in sofar. A repeated or unknown attribute character is rejected
 * with a static assert.
 */
template metaCompileAttributes(rchar[] patternattributes, uint sofar = 0)
{
    static if (0 == patternattributes.length)
    {
        // Nothing left to consume: the accumulated bits are the result.
        const uint metaCompileAttributes = sofar;
    }
    else static if ('g' == patternattributes[0])
    {
        static assert((sofar & REA.global) == 0, "Regexp error: Redundant attribute");
        const uint metaCompileAttributes = metaCompileAttributes!(patternattributes[1..$], sofar | REA.global);
    }
    else static if ('i' == patternattributes[0])
    {
        static assert((sofar & REA.ignoreCase) == 0, "Regexp error: Redundant attribute");
        const uint metaCompileAttributes = metaCompileAttributes!(patternattributes[1..$], sofar | REA.ignoreCase);
    }
    else static if ('m' == patternattributes[0])
    {
        static assert((sofar & REA.multiline) == 0, "Regexp error: Redundant attribute");
        const uint metaCompileAttributes = metaCompileAttributes!(patternattributes[1..$], sofar | REA.multiline);
    }
    else
    {
        static assert(0, "Regexp error: unrecognized attribute: " ~ patternattributes[0]);
    }
}
|---|
| | 838 | |
|---|
unittest
{
    // "mig" must yield multiline | ignoreCase | global, i.e. 4 + 2 + 1.
    static assert(7 == RegExp.metaCompileAttributes!("mig"));
}
|---|
| | 842 | |
|---|
| | 843 | // p is following the \ char |
|---|
| | 844 | // none of the cases are multibyte |
|---|
/**
 * Compile-time decoding of a single-character escape.
 *
 * pattern[p] is the character following the backslash; none of the
 * handled cases are multibyte. The template evaluates to the character
 * the escape denotes.
 *
 * Octal (\nnn), \x and \u escapes are not implemented at compile time.
 * They are rejected with an explicit message rather than leaving the
 * template without an escapechar member.
 */
template escapechar(char [] pattern, int p)
{
    static if (pattern[p]=='b') const dchar escapechar = '\b';
    else static if (pattern[p]=='f') const dchar escapechar = '\f';
    else static if (pattern[p]=='n') const dchar escapechar = '\n';
    else static if (pattern[p]=='r') const dchar escapechar = '\r';
    else static if (pattern[p]=='t') const dchar escapechar = '\t';
    else static if (pattern[p]=='v') const dchar escapechar = '\v';
    else static if (pattern[p]=='c') {
        // A trailing \c stands for the letter 'c' itself.
        static if (pattern.length==p+1) const dchar escapechar = 'c';
        // Note: we are deliberately not allowing dchar letters
        else static if ((('a' <= pattern[p+1] && pattern[p+1] <= 'z')
            || ('A' <= pattern[p+1] && pattern[p+1] <= 'Z'))) {
            // Control character: keep the low five bits of the letter.
            const dchar escapechar = pattern[p+1] & 0x1F;
        } else static assert(0, `letter expected following \c`);
    } else static if (pattern[p]>='0' && pattern[p]<='7') {
        static assert(0, `Regexp error: octal escapes (\nnn) are not supported in compile-time patterns`);
    } else static if (pattern[p]=='x') {
        static assert(0, `Regexp error: \x escapes are not supported in compile-time patterns`);
    } else static if (pattern[p]=='u') {
        static assert(0, `Regexp error: \u escapes are not supported in compile-time patterns`);
    } else const dchar escapechar = pattern[p];	// any other character escapes to itself
}
|---|
| | 866 | |
|---|
/**
 * Translate an attribute string into the corresponding REA bit mask.
 *
 * Params:
 *	patternattributes = sequence of the characters 'g', 'i' and 'm'
 *
 * Returns: the OR of the REA bits named by the string.
 *
 * Throws: RegExpException("unrecognized attribute") for any other
 * character, RegExpException("redundant attribute") when a character
 * occurs twice.
 */
static uint compileAttributes(rchar[] patternattributes)
{
    uint seen = 0;
    for (size_t k = 0; k < patternattributes.length; k++)
    {
        rchar letter = patternattributes[k];
        uint flag;

        if (letter == 'g')
            flag = REA.global;
        else if (letter == 'i')
            flag = REA.ignoreCase;
        else if (letter == 'm')
            flag = REA.multiline;
        else
            throw new RegExpException("unrecognized attribute");

        if (seen & flag)
            throw new RegExpException("redundant attribute");
        seen |= flag;
    }
    return seen;
}
|---|
| | 890 | |
|---|
| | 891 | static ubyte[] compilePattern(rchar[] pattern, out uint re_nsub, uint attributes) |
|---|
| | 892 | { |
|---|
| | 893 | OutBuffer buf; |
|---|
| | 894 | size_t p; // position of parser in pattern[] |
|---|
| | 895 | |
|---|
| | 896 | int errors; |
|---|
| | 897 | |
|---|
| 776 | | this.attributes = 0; |
|---|
| 777 | | foreach (rchar c; attributes) |
|---|
| 778 | | { REA att; |
|---|
| 779 | | |
|---|
| 780 | | switch (c) |
|---|
| 781 | | { |
|---|
| 782 | | case 'g': att = REA.global; break; |
|---|
| 783 | | case 'i': att = REA.ignoreCase; break; |
|---|
| 784 | | case 'm': att = REA.multiline; break; |
|---|
| 785 | | default: |
|---|
| 786 | | error("unrecognized attribute"); |
|---|
| 787 | | return; |
|---|
| 788 | | } |
|---|
| 789 | | if (this.attributes & att) |
|---|
| 790 | | { error("redundant attribute"); |
|---|
| 791 | | return; |
|---|
| 792 | | } |
|---|
| 793 | | this.attributes |= att; |
|---|
| 794 | | } |
|---|
| 795 | | |
|---|
| 796 | | input = null; |
|---|
| 797 | | |
|---|
| 798 | | this.pattern = pattern; |
|---|
| 799 | | this.flags = attributes; |
|---|
| 800 | | |
|---|
| 801 | | uint oldre_nsub = re_nsub; |
|---|
/**
 * Record a compilation error and abort by throwing.
 *
 * Increments errors, prints msg in debug(regexp) builds, then throws a
 * RegExpException carrying msg. Never returns normally.
 */
void error(char[] msg)
{
    ++errors;
    debug(regexp) printf("error: %.*s\n", msg);
    throw new RegExpException(msg);
}
|---|
| | 908 | |
|---|
| | 909 | |
|---|
| | 910 | // p is following the \ char |
|---|
/**
 * Decode the escape sequence at pattern[p], where p indexes the character
 * that follows the backslash, and advance p past what was consumed.
 *
 * Handles \b \f \n \r \t \v, \cX control letters, 1-3 digit octal,
 * 1-2 digit \x hex and 4 digit \u sequences; any other character stands
 * for itself.
 *
 * Returns: the character value denoted by the escape.
 *
 * Calls error() (which throws) when \c is followed by a non-letter.
 */
int escape()
in
{
    assert(p < pattern.length);
}
body
{
    // Value of a hexadecimal digit, or -1 when ch is not one.
    int hexValue(rchar ch)
    {
        if ('0' <= ch && ch <= '9')
            return ch - '0';
        if ('a' <= ch && ch <= 'f')
            return ch - 'a' + 10;
        if ('A' <= ch && ch <= 'F')
            return ch - 'A' + 10;
        return -1;
    }

    int c = pattern[p];		// none of the cases are multibyte
    switch (c)
    {
        case 'b':    c = '\b';	break;
        case 'f':    c = '\f';	break;
        case 'n':    c = '\n';	break;
        case 'r':    c = '\r';	break;
        case 't':    c = '\t';	break;
        case 'v':    c = '\v';	break;

        // BUG: Perl does \a and \e too, should we?

        case 'c':
            ++p;
            if (p == pattern.length)
                goto Lretc;	// trailing \c: the value is 'c' itself
            c = pattern[p];
            // Note: we are deliberately not allowing dchar letters
            if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
            {
                error("letter expected following \\c");
                return 0;	// not reached: error() throws
            }
            c &= 0x1F;
            break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            c -= '0';
            for (int i = 0; i < 2; i++)
            {
                p++;
                if (p == pattern.length)
                    goto Lretc;
                rchar tc = pattern[p];
                if (tc < '0' || '7' < tc)
                    return c;	// not an octal digit: leave it unconsumed
                int wider = c * 8 + (tc - '0');
                // Only values larger than 0377 overflow; \377 itself is
                // a valid sequence. On overflow the digit is treated as
                // not part of the sequence and is not consumed.
                if (wider > 0xFF)
                    return c;
                c = wider;
            }
            break;

        case 'x':
            c = 0;
            for (int i = 0; i < 2; i++)
            {
                p++;
                if (p == pattern.length)
                {
                    // Pattern ends right after \x: no hex digit at all,
                    // so the value of the sequence is 'x'.
                    if (i == 0)
                        return 'x';
                    goto Lretc;
                }
                int digit = hexValue(pattern[p]);
                if (digit < 0)
                {
                    // No digits: not a valid \xXX sequence, value is 'x'.
                    // One digit: that digit alone is the value.
                    return (i == 0) ? 'x' : c;
                }
                c = c * 16 + digit;
            }
            break;

        case 'u':
            c = 0;
            for (int i = 0; i < 4; i++)
            {
                p++;
                // Running off the end counts as a missing hex digit.
                int digit = (p < pattern.length) ? hexValue(pattern[p]) : -1;
                if (digit < 0)
                {
                    // Not a valid \uXXXX sequence: rewind so that none
                    // of the characters after the 'u' are consumed.
                    p -= i;
                    return 'u';
                }
                c = c * 16 + digit;
            }
            break;

        default:
            break;
    }
    p++;
Lretc:
    return c;
}
|---|
| | 1031 | |
|---|
| | 1032 | //private: |
|---|
| | 1033 | class Range |
|---|
| | 1034 | { |
|---|
| | 1035 | uint maxc; |
|---|
| | 1036 | uint maxb; |
|---|
| | 1037 | OutBuffer buf; |
|---|
| | 1038 | ubyte* base; |
|---|
| | 1039 | BitArray bits; |
|---|
| | 1040 | |
|---|
| | 1041 | this(OutBuffer buf) |
|---|
| | 1042 | { |
|---|
| | 1043 | this.buf = buf; |
|---|
| | 1044 | if (buf.data.length) |
|---|
| | 1045 | this.base = &buf.data[buf.offset]; |
|---|
| | 1046 | } |
|---|
| | 1047 | |
|---|
| | 1048 | void setbitmax(uint u) |
|---|
| | 1049 | { uint b; |
|---|
| | 1050 | |
|---|
| | 1051 | //printf("setbitmax(x%x), maxc = x%x\n", u, maxc); |
|---|
| | 1052 | if (u > maxc) |
|---|
| | 1053 | { |
|---|
| | 1054 | maxc = u; |
|---|
| | 1055 | b = u / 8; |
|---|
| | 1056 | if (b >= maxb) |
|---|
| | 1057 | { uint u2; |
|---|
| | 1058 | |
|---|
| |
|---|