root/PMTagCloud/bin/aggregate.pl

Revision 2093 (checked in by miyagawa, 14 years ago)

import PMTagCloud

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use FindBin;
5 use LWP::UserAgent;
6 use URI;
7 use URI::Escape;
8 use YAML;
9
10 our $Seed = YAML::Load(<<YAML);
11 abe:
12   - http://www.kantei.go.jp/jp/abespeech/index.html
13 koizumi:
14   - http://www.kantei.go.jp/jp/koizumispeech/index.html
15   - http://www.kantei.go.jp/jp/koizumispeech/2005/index.html
16   - http://www.kantei.go.jp/jp/koizumispeech/2004/index.html
17   - http://www.kantei.go.jp/jp/koizumispeech/2003/index.html
18   - http://www.kantei.go.jp/jp/koizumispeech/2002/index.html
19   - http://www.kantei.go.jp/jp/koizumispeech/2001/index.html
20 mori:
21   - http://www.kantei.go.jp/jp/morisouri/mori_speech/index.html
22 obuti:
23   - http://www.kantei.go.jp/jp/obutisouri/speech/index.html
24 hashimoto:
25   - http://www.kantei.go.jp/jp/hasimotosouri/speech/index.html
26 murayama:
27   - http://www.kantei.go.jp/jp/murayamasouri/speech/index.html
28   - http://www.kantei.go.jp/jp/murayamasouri/danwa/index.html
29 YAML
30
31 our $ua = LWP::UserAgent->new;
32
33 for my $pm (sort keys %$Seed) {
34     aggregate($pm, $Seed->{$pm});
35 }
36
37 sub aggregate {
38     my($pm, $urls) = @_;
39
40     for my $url (@$urls) {
41         my @links = find_links($url);
42         for my $link (@links) {
43             fetch_story($pm, $url, $link);
44         }
45     }
46 }
47
48 sub find_links {
49     my($url) = @_;
50
51     warn "find links from $url";
52
53     my $html = $ua->get($url)->content;
54     my @links = $html =~ m!<a href="([^/][^"]+\.html)">!gi;
55
56     return grep !/index\.html/, @links;
57 }
58
59 sub fetch_story {
60     my($pm, $base, $link) = @_;
61
62     my $dir  = File::Spec->catfile($FindBin::Bin, "..", 'data', $pm);
63     mkdir $dir, 0777 unless -e $dir;
64
65     my $url = URI->new_abs($link, $base);
66
67     (my $fn = $url->as_string) =~ s!.*speech/!!;
68     my $path = File::Spec->catfile($dir, uri_escape($fn));
69
70     warn "mirroring $url";
71
72     $ua->mirror($url, $path);
73 }
Note: See TracBrowser for help on using the browser.