1 # 2 # File: email-opt.pl 3 # 4 # Program to build a regex to match an internet email address, 5 # from Chapter 7 of _Mastering Regular Expressions_ (Friedl / O'Reilly) 6 # (http://www.ora.com/catalog/regexp/) 7 # 8 # Optimized version. 9 # 10 # Copyright 1997 O'Reilly & Associates, Inc. 11 # 12 13 package email; 14 15 # Some things for avoiding backslashitis later on. 16 $esc = '\\\\'; $Period = '\.'; 17 $space = '\040'; $tab = '\t'; 18 $OpenBR = '\['; $CloseBR = '\]'; 19 $OpenParen = '\('; $CloseParen = '\)'; 20 $NonASCII = '\x80-\xff'; $ctrl = '\000-\037'; 21 $CRlist = '\n\015'; # note: this should really be only \015. 22 23 # Items 19, 20, 21 24 $qtext = qq/[^$esc$NonASCII$CRlist\"]/; # for within "..." 25 $dtext = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/; # for within [...] 26 $quoted_pair = qq< $esc [^$NonASCII] >; # an escaped character 27 28 ############################################################################## 29 # Items 22 and 23, comment. 30 # Impossible to do properly with a regex, I make do by allowing at most one level of nesting. 31 $ctext = qq< [^$esc$NonASCII$CRlist()] >; 32 33 # $Cnested matches one non-nested comment. 34 # It is unrolled, with normal of $ctext, special of $quoted_pair. 35 $Cnested = qq< 36 $OpenParen # ( 37 $ctext* # normal* 38 (?: $quoted_pair $ctext* )* # (special normal*)* 39 $CloseParen # ) 40 >; 41 42 # $comment allows one level of nested parentheses 43 # It is unrolled, with normal of $ctext, special of ($quoted_pair|$Cnested) 44 $comment = qq< 45 $OpenParen # ( 46 $ctext* # normal* 47 (?: # ( 48 (?: $quoted_pair | $Cnested ) # special 49 $ctext* # normal* 50 )* # )* 51 $CloseParen # ) 52 >; 53 54 ############################################################################## 55 56 # $X is optional whitespace/comments. 57 $X = qq< 58 [$space$tab]* # Nab whitespace. 59 (?: $comment [$space$tab]* )* # If comment found, allow more spaces. 60 >; 61 62 63 64 # Item 10: atom 65 $atom_char = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/; 66 $atom = qq< 67 $atom_char+ # some number of atom characters... 68 (?!$atom_char) # ..not followed by something that could be part of an atom 69 >; 70 71 # Item 11: doublequoted string, unrolled. 72 $quoted_str = qq< 73 \" # " 74 $qtext * # normal 75 (?: $quoted_pair $qtext * )* # ( special normal* )* 76 \" # " 77 >; 78 79 # Item 7: word is an atom or quoted string 80 $word = qq< 81 (?: 82 $atom # Atom 83 | # or 84 $quoted_str # Quoted string 85 ) 86 >; 87 88 # Item 12: domain-ref is just an atom 89 $domain_ref = $atom; 90 91 # Item 13: domain-literal is like a quoted string, but [...] instead of "..." 92 $domain_lit = qq< 93 $OpenBR # [ 94 (?: $dtext | $quoted_pair )* # stuff 95 $CloseBR # ] 96 >; 97 98 # Item 9: sub-domain is a domain-ref or domain-literal 99 $sub_domain = qq< 100 (?: 101 $domain_ref 102 | 103 $domain_lit 104 ) 105 $X # optional trailing comments 106 >; 107 108 # Item 6: domain is a list of subdomains separated by dots. 109 $domain = qq< 110 $sub_domain 111 (?: 112 $Period $X $sub_domain 113 )* 114 >; 115 116 # Item 8: a route. A bunch of "@ $domain" separated by commas, followed by a colon. 117 $route = qq< 118 \@ $X $domain 119 (?: , $X \@ $X $domain )* # additional domains 120 : 121 $X # optional trailing comments 122 >; 123 124 # Item 6: local-part is a bunch of $word separated by periods 125 $local_part = qq< 126 $word $X 127 (?: 128 $Period $X $word $X # additional words 129 )* 130 >; 131 132 # Item 2: addr-spec is local@domain 133 $addr_spec = qq< 134 $local_part \@ $X $domain 135 >; 136 137 # Item 4: route-addr is <route? addr-spec> 138 $route_addr = qq[ 139 < $X # < 140 (?: $route )? # optional route 141 $addr_spec # address spec 142 > # > 143 ]; 144 145 146 # Item 3: phrase........ 147 $phrase_ctrl = '\000-\010\012-\037'; # like ctrl, but without tab 148 149 # Like atom-char, but without listing space, and uses phrase_ctrl. 150 # Since the class is negated, this matches the same as atom-char plus space and tab 151 $phrase_char = 152 qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/; 153 154 # We've worked it so that $word, $comment, and $quoted_str to not consume trailing $X 155 # because we take care of it manually. 156 $phrase = qq< 157 $word # leading word 158 $phrase_char * # "normal" atoms and/or spaces 159 (?: 160 (?: $comment | $quoted_str ) # "special" comment or quoted string 161 $phrase_char * # more "normal" 162 )* 163 >; 164 165 ## Item #1: mailbox is an addr_spec or a phrase/route_addr 166 $mailbox = qq< 167 $X # optional leading comment 168 (?: 169 $addr_spec # address 170 | # or 171 $phrase $route_addr # name and address 172 ) 173 >; 174 175 176 ########################################################################### 177 # Here's a little snippet to test it. 178 # Addresses given on the commandline are described. 179 # 180 181 # my $error = 0; 182 # my $valid; 183 # foreach $address (@ARGV) { 184 # $valid = $address =~ m/^$mailbox$/xo; 185 # printf "`$address' is syntactically %s.\n", $valid ? "valid" : "invalid"; 186 # $error = 1 if not $valid; 187 # } 188 # exit $error; 189 190