1  #
  2  # File:  email-opt.pl
  3  # 
  4  # Program to build a regex to match an internet email address,
  5  # from Chapter 7 of _Mastering Regular Expressions_ (Friedl / O'Reilly)
  6  # (http://www.ora.com/catalog/regexp/)
  7  #
  8  # Optimized version.
  9  #
 10  # Copyright 1997 O'Reilly & Associates, Inc.
 11  #
 12  
 13  package email;
 14  
 15  # Some things for avoiding backslashitis later on.
 16  $esc        = '\\\\';               $Period      = '\.';
 17  $space      = '\040';               $tab         = '\t';
 18  $OpenBR     = '\[';                 $CloseBR     = '\]';
 19  $OpenParen  = '\(';                 $CloseParen  = '\)';
 20  $NonASCII   = '\x80-\xff';          $ctrl        = '\000-\037';
 21  $CRlist     = '\n\015';  # note: this should really be only \015.
 22  
 23  # Items 19, 20, 21
 24  $qtext = qq/[^$esc$NonASCII$CRlist\"]/;               # for within "..."
 25  $dtext = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/;  # for within [...]
 26  $quoted_pair = qq< $esc [^$NonASCII] >; # an escaped character
 27  
 28  ##############################################################################
 29  # Items 22 and 23, comment.
 30  # Impossible to do properly with a regex, I make do by allowing at most one level of nesting.
 31  $ctext   = qq< [^$esc$NonASCII$CRlist()] >;
 32  
 33  # $Cnested matches one non-nested comment.
 34  # It is unrolled, with normal of $ctext, special of $quoted_pair.
 35  $Cnested = qq<
 36     $OpenParen                            #  (
 37        $ctext*                            #     normal*
 38        (?: $quoted_pair $ctext* )*        #     (special normal*)*
 39     $CloseParen                           #                       )
 40  >;
 41  
 42  # $comment allows one level of nested parentheses
 43  # It is unrolled, with normal of $ctext, special of ($quoted_pair|$Cnested)
 44  $comment = qq<
 45     $OpenParen                              #  (
 46         $ctext*                             #     normal*
 47         (?:                                 #       (
 48            (?: $quoted_pair | $Cnested )    #         special
 49             $ctext*                         #         normal*
 50         )*                                  #            )*
 51     $CloseParen                             #                )
 52  >;
 53  
 54  ##############################################################################
 55  
 56  # $X is optional whitespace/comments.
 57  $X = qq<
 58     [$space$tab]*                    # Nab whitespace.
 59     (?: $comment [$space$tab]* )*    # If comment found, allow more spaces.
 60  >;
 61  
 62  
 63  
 64  # Item 10: atom
 65  $atom_char   = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/;
 66  $atom = qq<
 67    $atom_char+    # some number of atom characters...
 68    (?!$atom_char) # ..not followed by something that could be part of an atom
 69  >;
 70  
 71  # Item 11: doublequoted string, unrolled.
 72  $quoted_str = qq<
 73      \"                                     # "
 74         $qtext *                            #   normal
 75         (?: $quoted_pair $qtext * )*        #   ( special normal* )*
 76      \"                                     #        "
 77  >;
 78  
 79  # Item 7: word is an atom or quoted string
 80  $word = qq<
 81      (?:
 82         $atom                 # Atom
 83         |                       #  or
 84         $quoted_str           # Quoted string
 85       )
 86  >;
 87  
 88  # Item 12: domain-ref is just an atom
 89  $domain_ref  = $atom;
 90  
 91  # Item 13: domain-literal is like a quoted string, but [...] instead of  "..."
 92  $domain_lit  = qq<
 93      $OpenBR                            # [
 94      (?: $dtext | $quoted_pair )*     #    stuff
 95      $CloseBR                           #           ]
 96  >;
 97  
 98  # Item 9: sub-domain is a domain-ref or domain-literal
 99  $sub_domain  = qq<
100    (?:
101      $domain_ref
102      |
103      $domain_lit
104     )
105     $X # optional trailing comments
106  >;
107  
108  # Item 6: domain is a list of subdomains separated by dots.
109  $domain = qq<
110       $sub_domain
111       (?:
112          $Period $X $sub_domain
113       )*
114  >;
115  
116  # Item 8: a route. A bunch of "@ $domain" separated by commas, followed by a colon.
117  $route = qq<
118      \@ $X $domain
119      (?: , $X \@ $X $domain )*  # additional domains
120      :
121      $X # optional trailing comments
122  >;
123  
124  # Item 6: local-part is a bunch of $word separated by periods
125  $local_part = qq<
126      $word $X
127      (?:
128          $Period $X $word $X # additional words
129      )*
130  >;
131  
132  # Item 2: addr-spec is local@domain
133  $addr_spec  = qq<
134    $local_part \@ $X $domain
135  >;
136  
137  # Item 4: route-addr is <route? addr-spec>
138  $route_addr = qq[
139      < $X                 # <
140         (?: $route )?     #       optional route
141         $addr_spec        #       address spec
142      >                    #                 >
143  ];
144  
145  
146  # Item 3: phrase........
147  $phrase_ctrl = '\000-\010\012-\037'; # like ctrl, but without tab
148  
149  # Like atom-char, but without listing space, and uses phrase_ctrl.
150  # Since the class is negated, this matches the same as atom-char plus space and tab
151  $phrase_char =
152     qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/;
153  
154  # We've worked it so that $word, $comment, and $quoted_str to not consume trailing $X
155  # because we take care of it manually.
156  $phrase = qq<
157     $word                        # leading word
158     $phrase_char *               # "normal" atoms and/or spaces
159     (?:
160        (?: $comment | $quoted_str ) # "special" comment or quoted string
161        $phrase_char *            #  more "normal"
162     )*
163  >;
164  
165  ## Item #1: mailbox is an addr_spec or a phrase/route_addr
166  $mailbox = qq<
167      $X                                  # optional leading comment
168      (?:
169              $addr_spec                  # address
170              |                             #  or
171              $phrase  $route_addr      # name and address
172       )
173  >;
174  
175  
176  ###########################################################################
177  # Here's a little snippet to test it.
178  # Addresses given on the commandline are described.
179  #
180  
181  # my $error = 0;
182  # my $valid;
183  # foreach $address (@ARGV) {
184  #     $valid = $address =~ m/^$mailbox$/xo;
185  #     printf "`$address' is syntactically %s.\n", $valid ? "valid" : "invalid";
186  #     $error = 1 if not $valid;
187  # }
188  # exit $error;
189  
190