root/PMTagCloud/bin/summarize.pl

Revision 2093 (checked in by miyagawa, 14 years ago)

import PMTagCloud

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use utf8;
4 use warnings;
5 use DateTime;
6 use Encode;
7 use File::Find::Rule;
8 use File::Spec;
9 use FindBin;
10 use HTML::TreeBuilder;
11 use Encode::Detect::Detector;
12 use Text::MeCab;
13 use YAML;
14
15 our $MecabEncoding = "euc-jp";
16
17 my $base = File::Spec->catfile($FindBin::Bin, "..", "data");
18 my @files = File::Find::Rule->file->name('*.html')->in($base);
19
20 mkdir "$base/dump", 0777 unless -e "$base/dump";
21
22 my %data;
23 for my $file (@files) {
24     my($pm, $path) = (my $f = $file) =~ m!^$base/(\w+)/(.*?)$!;
25
26     warn "reading $file";
27     my $enc = guess_enc($file);
28     open my $fh, "<:encoding($enc)", $file;
29
30     my $tree = HTML::TreeBuilder->new;
31     $tree->parse_file($fh);
32
33     my $text = $tree->as_text;
34     my $date = find_date($path, $text) or next;
35     next if $date->year < 1900;
36
37     $text = encode($MecabEncoding, $text);
38
39     my $key = join "-", $pm, $date->year;
40
41     my $mecab = Text::MeCab->new;
42     for (my $node = $mecab->parse($text); $node; $node = $node->next) {
43         my $feature = decode($MecabEncoding, $node->feature);
44         my $surface = decode($MecabEncoding, $node->surface);
45         if ($feature =~ /^名詞/ && $feature !~ /(非自立|代名詞|数|接尾)/) {
46             $data{$key}{$surface}++;
47         }
48     }
49 }
50
51 for my $key (keys %data) {
52     YAML::DumpFile("$base/dump/$key.yml", $data{$key});
53 }
54
55 sub guess_enc {
56     my $file = shift;
57
58     open my $fh, "<", $file or die $!;
59     my $data = join '', <$fh>;
60
61     Encode::Detect::Detector::detect($data);
62 }
63
64 sub find_date {
65     my($path, $text) = @_;
66
67     if ($path =~ /(\d{4})(?:%2F)?(\d\d?)(?:%2F)?(\d{2})/i) {
68         if ($2 <= 12) {
69             return DateTime->new(year => $1, month => $2, day => $3);
70         }
71     }
72
73     if ($path =~ /(\d{4})(?:%2F)?.*-(\d\d?)(\d{2})\.html/i) {
74         if ($2 <= 12) {
75             return DateTime->new(year => $1, month => $2, day => $3);
76         }
77     }
78
79     if ($text =~ /平成(\d+)年(\d+)月(\d+)日/) {
80         return DateTime->new(year => 1988 + norm($1), month => norm($2), day => norm($3));
81     }
82
83     if ($path =~ /(19\d{2})/) {
84         return DateTime->new(year => $1, month => 1, day => 1);
85     }
86 }
87
88 sub norm {
89     my $str = shift;
90     $str =~ tr/0-9/0-9/;
91     return $str;
92 }
Note: See TracBrowser for help on using the browser.