Changeset 429

Show
Ignore:
Timestamp:
01/13/02 21:52:05
Author:
miyagawa
Message:

rewrite

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Email-Find/trunk/Changes

    r276 r429  
    11Revision history for Perl extension Email::Find. 
     2 
     30.09 
     4    * Complete rewrite of the module: added new OO interface 
    25 
    360.08  Fri Oct  5 08:35:25 JST 2001 
  • Email-Find/trunk/MANIFEST

    r276 r429  
    44README 
    55lib/Email/Find.pm 
     6lib/Email/Find/addrspec.pm 
    67t/Find.t 
    78t/RFC822.t 
     9t/addr-spec.t 
     10t/new-api.t 
    811t/rfc822.txt 
  • Email-Find/trunk/Makefile.PL

    r276 r429  
    1515$PACKAGE = 'Email::Find'; 
    1616my($PACKAGE_FILE) = $PACKAGE =~ /(?:\::)?([^:]+)$/; 
    17 $LAST_API_CHANGE = 0
     17$LAST_API_CHANGE = 0.09
    1818 
    1919eval "require $PACKAGE"; 
  • Email-Find/trunk/README

    r207 r429  
    11NAME 
    2       Email::Find - Find RFC 822 email addresses in plain text 
     2    Email::Find - Find RFC 822 email addresses in plain text 
    33 
    44SYNOPSIS 
    55      use Email::Find; 
     6 
     7      # new object oriented interface 
     8      my $finder = Email::Find->new(\&callback); 
     9      my $num_found - $finder->find(\$text); 
     10 
     11      # good old functional style 
    612      $num_found = find_emails($text, \&callback); 
    713 
    814DESCRIPTION 
    9     This is a module for finding a *subset* of RFC 822 email addresses in 
    10     arbitrary text (the CAVEATS manpage). The addresses it finds are not 
    11     guaranteed to exist or even actually be email addresses at all (the 
    12     CAVEATS manpage), but they will be valid RFC 822 syntax. 
     15    Email::Find is a module for finding a *subset* of RFC 822 email 
     16    addresses in arbitrary text (see the section on "CAVEATS"). The 
     17    addresses it finds are not guaranteed to exist or even actually be email 
     18    addresses at all (see the section on "CAVEATS"), but they will be valid 
     19    RFC 822 syntax. 
    1320 
    1421    Email::Find will perform some heuristics to avoid some of the more 
     
    1623    can be done without a human. 
    1724 
    18   Functions 
     25METHODS 
     26    new 
     27          $finder = Email::Find->new(\&callback); 
    1928 
    20     Email::Find exports one function, find_emails(). It works very similar 
    21     to URI::Find's find_uris()
     29        Constructs new Email::Find object. Specified callback will be called 
     30        with each email as they're found
    2231 
    23       $num_emails_found = find_emails($text, \&callback); 
     32    find 
     33          $num_emails_found = $finder->find(\$text); 
    2434 
    25     The first argument is a block of text for find_emails to search through 
    26     and manipulate. Second is a callback routine which defines what to do 
    27     with each email as they're found. It returns the total number of emails 
    28     found. 
     35        Finds email addresses in the text and executes callback registered. 
    2936 
    30     The callback is given two arguments. The first is a Mail::Address object 
    31     representing the address found. The second is the actual original email 
    32     as found in the text. Whatever the callback returns will replace the 
    33     original text. 
     37        The callback is given two arguments. The first is a Mail::Address 
     38        object representing the address found. The second is the actual 
     39        original email as found in the text. Whatever the callback returns 
     40        will replace the original text. 
     41 
     42FUNCTIONS 
     43        For backward compatibility, Email::Find exports one function, 
     44        find_emails(). It works very similar to URI::Find's find_uris(). 
    3445 
    3546EXAMPLES 
    36       # Simply print out all the addresses found leaving the text undisturbed. 
    37       find_emails($text, sub { 
    38                              my($email, $orig_email) = @_; 
    39                              print "Found ".$email->format."\n"; 
    40                              return $orig_email; 
    41                          }); 
     47          use Email::Find; 
    4248 
    43       # For each email found, ping its host to see if its alive. 
    44       require Net::Ping; 
    45       $ping = Net::Ping->new; 
    46       my %Pinged = (); 
    47       find_emails($text, sub { 
    48                              my($email, $orig_email) = @_; 
    49                              my $host = $email->host; 
    50                              next if exists $Pinged{$host}; 
    51                              $Pinged{$host} = $ping->ping($host); 
    52                          }); 
     49          # Simply print out all the addresses found leaving the text undisturbed. 
     50          my $finder = Email::Find->new(sub { 
     51                                            my($email, $orig_email) = @_; 
     52                                            print "Found ".$email->format."\n"; 
     53                                            return $orig_email; 
     54                                        }); 
     55          $finder->find(\$text); 
    5356 
    54       while( my($host, $up) = each %Pinged ) { 
    55           print "$host is ". $up ? 'up' : 'down' ."\n"; 
    56       } 
     57          # For each email found, ping its host to see if its alive. 
     58          require Net::Ping; 
     59          $ping = Net::Ping->new; 
     60          my %Pinged = (); 
     61          my $finder = Email::Find->new(sub { 
     62                                            my($email, $orig_email) = @_; 
     63                                            my $host = $email->host; 
     64                                            next if exists $Pinged{$host}; 
     65                                            $Pinged{$host} = $ping->ping($host); 
     66                                        }); 
    5767 
    58       # Count how many addresses are found. 
    59       print "Found ", find_emails($text, sub { return $_[1] }), " addresses\n"; 
     68          $finder->find(\$text); 
    6069 
    61       # Wrap each address in an HTML mailto link. 
    62       find_emails($text, sub { 
    63                              my($email, $orig_email) = @_; 
    64                              my($address) = $email->format; 
    65                              return qq|<a href="mailto:$address">$orig_email</a>|; 
    66                          }); 
     70          while( my($host, $up) = each %Pinged ) { 
     71              print "$host is ". $up ? 'up' : 'down' ."\n"; 
     72          } 
     73 
     74          # Count how many addresses are found. 
     75          my $finder = Email::Find->new(sub { $_[1] }); 
     76          print "Found ", $finder->find(\$text), " addresses\n"; 
     77 
     78          # Wrap each address in an HTML mailto link. 
     79          my $finder = Email::Find->new( 
     80              sub { 
     81                  my($email, $orig_email) = @_; 
     82                  my($address) = $email->format; 
     83                  return qq|<a href="mailto:$address">$orig_email</a>|; 
     84              }, 
     85          ); 
     86          $finder->find(\$text); 
     87 
     88SUBCLASSING 
     89        If you want to change the way this module works in finding email 
     90        address, you can do it by making your subclass of Email::Find, which 
     91        overrides "addr_regex" and "do_validate" method. 
     92 
     93        For example, the following class can additionally find email 
     94        addresses with dot before at mark. This is illegal in RFC822, see 
     95        the Email::Valid::Loose manpage for details. 
     96 
     97          package Email::Find::Loose; 
     98          use base qw(Email::Find); 
     99          use Email::Valid::Loose; 
     100 
     101          # should return regex, which Email::Find will use in finding 
     102          # strings which are "thought to be" email addresses 
     103          sub addr_regex { 
     104              return $Email::Valid::Loose::Addr_spec_re; 
     105          } 
     106 
     107          # should validate $addr is a valid email or not. 
     108          # if so, return the address as a string. 
     109          # else, return undef 
     110          sub do_validate { 
     111              my($self, $addr) = @_; 
     112              return Email::Valid::Loose->address($addr); 
     113          } 
     114 
     115        Let's see another example, which validates if the address is an 
     116        existent one or not, with Mail::CheckUser module. 
     117 
     118          package Email::Find::Existent; 
     119          use base qw(Email::Find); 
     120          use Mail::CheckUser qw(check_email); 
     121 
     122          sub do_validate { 
     123              my($self, $addr) = @_; 
     124              return check_email($addr) ? $addr : undef; 
     125          } 
    67126 
    68127CAVEATS 
    69     Why a subset of RFC 822? 
    70         I say that this module finds a *subset* of RFC 822 because if I 
    71         attempted to look for *all* possible valid RFC 822 addresses I'd 
    72         wind up practically matching the entire block of text! The complet
    73         specification is so wide open that its difficult to construct 
    74         soemthing that's *not* an RFC 822 address. 
     128        Why a subset of RFC 822? 
     129            I say that this module finds a *subset* of RFC 822 because if I 
     130            attempted to look for *all* possible valid RFC 822 addresses I'd 
     131            wind up practically matching the entire block of text! Th
     132            complete specification is so wide open that its difficult to 
     133            construct soemthing that's *not* an RFC 822 address. 
    75134 
    76         To keep myself sane, I look for the 'address spec' or 'global 
    77         address' part of an RFC 822 address. This is the part which most 
    78         people consider to be an email address (the 'foo@bar.com' part) and 
    79         it is also the part which contains the information necessary for 
    80         delivery. 
     135            To keep myself sane, I look for the 'address spec' or 'global 
     136            address' part of an RFC 822 address. This is the part which most 
     137            people consider to be an email address (the 'foo@bar.com' part) 
     138            and it is also the part which contains the information necessary 
     139            for delivery. 
    81140 
    82     Why are some of the matches not email addresses? 
    83         Alas, many things which aren't email addresses *look* like email 
    84         addresses and parse just fine as them. The biggest headache is email 
    85         and usenet and email message IDs. I do my best to avoid them, but 
    86         there's only so much cleverness you can pack into one library. 
     141        Why are some of the matches not email addresses? 
     142            Alas, many things which aren't email addresses *look* like email 
     143            addresses and parse just fine as them. The biggest headache is 
     144            email and usenet and email message IDs. I do my best to avoid 
     145            them, but there's only so much cleverness you can pack into one 
     146            library. 
    87147 
    88148AUTHORS 
    89     Copyright 2000, 2001 Michael G Schwern <schwern@pobox.com>. All rights 
    90     reserved. 
     149        Copyright 2000, 2001 Michael G Schwern <schwern@pobox.com>. All 
     150        rights reserved. 
    91151 
    92     Current maintainer is Tatsuhiko Miyagawa <miyagawa@bulknews.net>. 
     152        Current maintainer is Tatsuhiko Miyagawa <miyagawa@bulknews.net>. 
    93153 
    94154THANKS 
    95     Schwern thanks to Jeremy Howard for his patch to make it work under 
    96     5.005. 
     155        Schwern thanks to Jeremy Howard for his patch to make it work under 
     156        5.005. 
    97157 
    98158LICENSE 
    99     This module is free software; you may redistribute it and/or modify it 
    100     under the same terms as Perl itself. 
     159        This module is free software; you may redistribute it and/or modify 
     160        it under the same terms as Perl itself. 
    101161 
    102     The author STRONGLY SUGGESTS that this module not be used for the 
    103     purposes of sending unsolicited email (ie. spamming) in any way, shape 
    104     or form or for the purposes of generating lists for commercial sale. 
     162        The author STRONGLY SUGGESTS that this module not be used for the 
     163        purposes of sending unsolicited email (ie. spamming) in any way, 
     164        shape or form or for the purposes of generating lists for commercial 
     165        sale. 
    105166 
    106     If you use this module for spamming I reserve the right to make fun of 
    107     you. 
     167        If you use this module for spamming I reserve the right to make fun 
     168        of you. 
    108169 
    109170SEE ALSO 
    110     the Email::Valid manpage, RFC 822, the URI::Find manpage, the 
    111     Apache::AntiSpam manpage 
     171        the Email::Valid manpage, RFC 822, the URI::Find manpage, the 
     172        Apache::AntiSpam manpage, the Email::Valid::Loose manpage 
    112173 
  • Email-Find/trunk/lib/Email/Find.pm

    r276 r429  
    33use strict; 
    44use vars qw($VERSION @EXPORT); 
    5 $VERSION = '0.08'
     5$VERSION = 0.09
    66 
    77# Need qr//. 
     
    1212 
    1313use Email::Valid; 
    14 require Mail::Address; 
    15  
    16 # This is the BNF from RFC 822 
    17 my $esc         = '\\\\';               my $period      = '\.'; 
    18 my $space       = '\040'; 
    19 my $open_br     = '\[';                 my $close_br    = '\]'; 
    20 my $nonASCII    = '\x80-\xff';          my $ctrl        = '\000-\037'; 
    21 my $cr_list     = '\n\015'; 
    22 my $qtext       = qq/[^$esc$nonASCII$cr_list\"]/; 
    23 my $dtext       = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/; 
    24 my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>; 
    25 my $atom_char   = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/; 
    26 my $atom        = qq<$atom_char+(?!$atom_char)>; 
    27 my $quoted_str  = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; 
    28 my $word        = qq<(?:$atom|$quoted_str)>; 
    29 my $local_part  = qq<$word(?:$period$word)*>;   #" for emacs 
    30  
    31 # This is a combination of the domain name BNF from RFC 1035 plus the 
    32 # domain literal definition from RFC 822, but allowing domains starting 
    33 # with numbers. 
    34 my $label       = q/[A-Za-z\d](?:[A-Za-z\d-]*[A-Za-z\d])?/; 
    35 my $domain_ref  = qq<$label(?:$period$label)*>; 
    36 my $domain_lit  = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>; 
    37 my $domain      = qq<(?:$domain_ref|$domain_lit)>; 
    38  
    39  
    40 # Finally, the address-spec regex (more or less) 
    41 use vars qw($Addr_spec_re); 
    42 $Addr_spec_re   = qr<$local_part\s*\@\s*$domain>; 
    43  
    44  
    45  
    46 my $validator = Email::Valid->new('-fudge'      => 0, 
    47                                   '-fqdn'       => 1, 
    48                                   '-local_rules' => 0, 
    49                                   '-mxcheck'    => 0, 
    50                                  ); 
    51  
    52 sub find_emails (\$&) { 
     14use Email::Find::addrspec; 
     15use Mail::Address; 
     16 
     17sub addr_regex { $Addr_spec_re } 
     18 
     19
     20    my $validator = Email::Valid->new( 
     21        '-fudge'      => 0, 
     22        '-fqdn'       => 1, 
     23        '-local_rules' => 0, 
     24        '-mxcheck'    => 0, 
     25    ); 
     26 
     27    sub do_validate { 
     28        my($self, $addr) = @_; 
     29        $validator->address($addr); 
     30    } 
     31
     32 
     33sub new { 
     34    my($proto, $callback) = @_; 
     35    my $class = ref $proto || $proto; 
     36    bless { callback => $callback }, $class; 
     37
     38 
     39sub find { 
     40    my($self, $r_text) = @_; 
     41 
     42    my $emails_found = 0; 
     43    my $re = $self->addr_regex; 
     44    $$r_text =~ s{($re)}{ 
     45        my($replace, $found) = $self->validate($1); 
     46        $emails_found += $found; 
     47        $replace; 
     48    }eg; 
     49    return $emails_found; 
     50
     51 
     52sub validate { 
     53    my($self, $orig_match) = @_; 
     54 
     55    my $replace; 
     56    my $found = 0; 
     57 
     58    # XXX Add cruft handling. 
     59    my($start_cruft) = ''; 
     60    my($end_cruft)   = ''; 
     61 
     62 
     63    if( $orig_match =~ s|([),.'";?!]+)$|| ) { #"')){ 
     64        $end_cruft = $1; 
     65    } 
     66    if( my $email = $self->do_validate($orig_match) ) { 
     67        $email = Mail::Address->new('', $email); 
     68        $found++; 
     69        $replace = $start_cruft . $self->{callback}->($email, $orig_match) . $end_cruft; 
     70    } 
     71    else { 
     72        # XXX Again with the cruft! 
     73        $replace = $start_cruft . $orig_match . $end_cruft; 
     74    } 
     75    return $replace, $found; 
     76
     77 
     78# backward comaptibility 
     79sub find_emails(\$&) { 
    5380    my($r_text, $callback) = @_; 
    54  
    55     my $emails_found = 0; 
    56  
    57     study($$r_text); 
    58  
    59     $$r_text =~ s{($Addr_spec_re)}{ 
    60         my($orig_match) = $1; 
    61  
    62         # XXX Add cruft handling. 
    63         my($start_cruft) = ''; 
    64         my($end_cruft)   = ''; 
    65         if( $orig_match =~ s|([),.'";?!]+)$|| ) { 
    66             $end_cruft = $1; 
    67         } 
    68  
    69         if( my $email = $validator->address($orig_match) ) { 
    70             $email = Mail::Address->new('', $email); 
    71             $emails_found++; 
    72  
    73             $start_cruft . $callback->($email, $orig_match) . $end_cruft; 
    74         } 
    75         else { 
    76             # XXX Again with the cruft! 
    77  
    78             $start_cruft . $orig_match . $end_cruft; 
    79         } 
    80     }eg; 
    81  
    82     return $emails_found; 
    83 
    84  
    85 return '*@qt.to'; 
    86  
     81    my $finder = __PACKAGE__->new($callback); 
     82    $finder->find($r_text); 
     83
     84 
     851; 
    8786__END__ 
    8887 
     
    9190=head1 NAME 
    9291 
    93   Email::Find - Find RFC 822 email addresses in plain text 
    94  
     92Email::Find - Find RFC 822 email addresses in plain text 
    9593 
    9694=head1 SYNOPSIS 
    9795 
    9896  use Email::Find; 
     97 
     98  # new object oriented interface 
     99  my $finder = Email::Find->new(\&callback); 
     100  my $num_found - $finder->find(\$text); 
     101 
     102  # good old functional style 
    99103  $num_found = find_emails($text, \&callback); 
    100104 
    101  
    102105=head1 DESCRIPTION 
    103106 
    104 This is a module for finding a I<subset> of RFC 822 email addresses in 
    105 arbitrary text (L<CAVEATS>).  The addresses it finds are no
    106 guaranteed to exist or even actually be email addresses at all 
    107 (L<CAVEATS>), but they will be valid RFC 822 syntax. 
     107Email::Find is a module for finding a I<subset> of RFC 822 email 
     108addresses in arbitrary text (see L</"CAVEATS">).  The addresses i
     109finds are not guaranteed to exist or even actually be email addresses 
     110at all (see L</"CAVEATS">), but they will be valid RFC 822 syntax. 
    108111 
    109112Email::Find will perform some heuristics to avoid some of the more 
     
    111114which can be done without a human. 
    112115 
    113  
    114 =head2 Functions 
    115  
    116 Email::Find exports one function, find_emails().  It works very 
    117 similar to URI::Find's find_uris(). 
    118  
    119   $num_emails_found = find_emails($text, \&callback); 
    120  
    121 The first argument is a block of text for find_emails to search 
    122 through and manipulate.  Second is a callback routine which defines 
    123 what to do with each email as they're found.  It returns the total 
    124 number of emails found. 
     116=head1 METHODS 
     117 
     118=over 4 
     119 
     120=item new 
     121 
     122  $finder = Email::Find->new(\&callback); 
     123 
     124Constructs new Email::Find object. Specified callback will be called 
     125with each email as they're found. 
     126 
     127=item find 
     128 
     129  $num_emails_found = $finder->find(\$text); 
     130 
     131Finds email addresses in the text and executes callback registered. 
    125132 
    126133The callback is given two arguments.  The first is a Mail::Address 
     
    129136will replace the original text. 
    130137 
     138=head1 FUNCTIONS 
     139 
     140For backward compatibility, Email::Find exports one function, 
     141find_emails(). It works very similar to URI::Find's find_uris(). 
    131142 
    132143=head1 EXAMPLES 
    133144 
     145  use Email::Find; 
     146 
    134147  # Simply print out all the addresses found leaving the text undisturbed. 
    135   find_emails($text, sub { 
    136                          my($email, $orig_email) = @_; 
    137                          print "Found ".$email->format."\n"; 
    138                          return $orig_email; 
    139                      }); 
    140  
     148  my $finder = Email::Find->new(sub { 
     149                                   my($email, $orig_email) = @_; 
     150                                   print "Found ".$email->format."\n"; 
     151                                   return $orig_email; 
     152                               }); 
     153  $finder->find(\$text); 
    141154 
    142155  # For each email found, ping its host to see if its alive. 
     
    144157  $ping = Net::Ping->new; 
    145158  my %Pinged = (); 
    146   find_emails($text, sub { 
    147                          my($email, $orig_email) = @_; 
    148                          my $host = $email->host; 
    149                          next if exists $Pinged{$host}; 
    150                          $Pinged{$host} = $ping->ping($host); 
    151                      }); 
     159  my $finder = Email::Find->new(sub { 
     160                                    my($email, $orig_email) = @_; 
     161                                    my $host = $email->host; 
     162                                    next if exists $Pinged{$host}; 
     163                                    $Pinged{$host} = $ping->ping($host); 
     164                                }); 
     165 
     166  $finder->find(\$text); 
    152167 
    153168  while( my($host, $up) = each %Pinged ) { 
     
    155170  } 
    156171 
    157  
    158172  # Count how many addresses are found. 
    159   print "Found ", find_emails($text, sub { return $_[1] }), " addresses\n"
    160  
     173  my $finder = Email::Find->new(sub { $_[1] })
     174  print "Found ", $finder->find(\$text), " addresses\n"; 
    161175 
    162176  # Wrap each address in an HTML mailto link. 
    163   find_emails($text, sub { 
    164                          my($email, $orig_email) = @_; 
    165                          my($address) = $email->format; 
    166                          return qq|<a href="mailto:$address">$orig_email</a>|; 
    167                      }); 
    168  
     177  my $finder = Email::Find->new( 
     178      sub { 
     179          my($email, $orig_email) = @_; 
     180          my($address) = $email->format; 
     181          return qq|<a href="mailto:$address">$orig_email</a>|; 
     182      }, 
     183  ); 
     184  $finder->find(\$text); 
     185 
     186=head1 SUBCLASSING 
     187 
     188If you want to change the way this module works in finding email 
     189address, you can do it by making your subclass of Email::Find, which 
     190overrides C<addr_regex> and C<do_validate> method. 
     191 
     192For example, the following class can additionally find email addresses 
     193with dot before at mark. This is illegal in RFC822, see 
     194L<Email::Valid::Loose> for details. 
     195 
     196  package Email::Find::Loose; 
     197  use base qw(Email::Find); 
     198  use Email::Valid::Loose; 
     199 
     200  # should return regex, which Email::Find will use in finding 
     201  # strings which are "thought to be" email addresses 
     202  sub addr_regex { 
     203      return $Email::Valid::Loose::Addr_spec_re; 
     204  } 
     205 
     206  # should validate $addr is a valid email or not. 
     207  # if so, return the address as a string. 
     208  # else, return undef 
     209  sub do_validate { 
     210      my($self, $addr) = @_; 
     211      return Email::Valid::Loose->address($addr); 
     212  } 
     213 
     214Let's see another example, which validates if the address is an 
     215existent one or not, with Mail::CheckUser module. 
     216 
     217  package Email::Find::Existent; 
     218  use base qw(Email::Find); 
     219  use Mail::CheckUser qw(check_email); 
     220 
     221  sub do_validate { 
     222      my($self, $addr) = @_; 
     223      return check_email($addr) ? $addr : undef; 
     224  } 
    169225 
    170226=head1 CAVEATS 
     
    196252=head1 AUTHORS 
    197253 
    198 Copyright 2000, 2001 Michael G Schwern <schwern@pobox.com>. 
     254Copyright 2000, 2001 Michael G Schwern E<lt>schwern@pobox.comE<gt>. 
    199255All rights reserved. 
    200256 
    201 Current maintainer is Tatsuhiko Miyagawa <miyagawa@bulknews.net>. 
     257Current maintainer is Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>. 
    202258 
    203259=head1 THANKS 
     
    225281=head1 SEE ALSO 
    226282 
    227 L<Email::Valid>, RFC 822, L<URI::Find>, L<Apache::AntiSpam> 
     283L<Email::Valid>, RFC 822, L<URI::Find>, L<Apache::AntiSpam>, 
     284L<Email::Valid::Loose> 
    228285 
    229286=cut 
  • Email-Find/trunk/t/Find.t

    r276 r429  
    11use strict; 
    2 use Test::More 'no_plan'; # XXX 
     2use Test::More tests => 23; 
    33BEGIN { use_ok('Email::Find') } 
    44