use strict;
use vars qw(@day_names);

# Titles that the code below compares against by exact string match.
my $pinni_title             = "(TaY) Café Pinni";
my $bio_title               = "(TAYS) Bio";
my $kliininen_title         = "(TAYS) Arvo";
my $kliininen_fusion_title  = "(TAYS) Arvo Fusion Kitchen";
my $zip_salaattibaari_title = "(TTY) Zip Salaattibaari";

# Common prefix of every menu page URL.
my $base_url = "http://www.juvenes.fi/Suomeksi/Ravintolat_ja_kahvilat/Opiskelijaravintolat";

# One entry per restaurant: [ title, menu page URL, flag, position ].
# NOTE(review): the last two fields ("M" or "", and "left"/"middle"/"right")
# are not interpreted anywhere in this file; they are passed through to the
# caller untouched as part of the info array.
my @restaurant_info = (
    [ "(TaY) Yliopiston Ravintola",
      "$base_url/_TaY__Yliopiston_Ravintola",
      "M", "left" ],
    [ "(TaY) Yliopiston Ravintola / Salaattibaari",
      "$base_url/_TaY__Yliopiston_Ravintola/Salaattibaari",
      "", "left" ],
    [ "(TaY) Fusion Kitchen",
      "$base_url/_TaY__Yliopiston_Ravintola/Fusion_Kitchen",
      "", "left" ],
    [ $pinni_title,
      "$base_url/_TaY__Caf__Pinni",
      "M", "middle" ],
    [ $bio_title,
      "$base_url/_TaY_Kauppi__Medica_Bio",
      "M", "left" ],
    [ $kliininen_title,
      "$base_url/_TaY_Kauppi__Medica_Arvo",
      "M", "left" ],
    [ $kliininen_fusion_title,
      "$base_url/_TaY_Kauppi__Medica_Arvo/Fusion_Kitchen",
      "M", "left" ],
    [ "(TTY) Newton",
      "$base_url/_TTY__Newton",
      "", "left" ],
    [ "(TTY) Zip",
      "$base_url/_TTY__Zip",
      "", "right" ],
    [ "(TTY) Edison",
      "$base_url/_TTY__Edison",
      "", "middle" ],
    [ $zip_salaattibaari_title,
      "$base_url/_TTY__Zip/Salaattibaari",
      "", "right" ],
    [ "(TTY) Pastabaari",
      "$base_url/_TTY__Caf____Fast_Voltti/Pastabaari",
      "", "middle" ],
    [ "(TTY) Fast Voltti",
      "$base_url/_TTY__Caf____Fast_Voltti",
      "", "middle" ],
    [ "(TTY) Fusion Kitchen",
      "$base_url/_TTY__Newton/Fusion_Kitchen",
      "", "left" ],
    [ "(TAMK) Dot",
      "$base_url/_TAMK__Dot__Ziberia_",
      "", "middle" ]
);

# Parsed results, one entry per successfully parsed restaurant page:
#   [ title, open hours text, week number, [ per-day food lists ], info ref ]
my @restaurants;

# Parser state shared by the parse_* subs below.  $parse_func always points
# at the sub that handles the next token; the states advance as
#   parse_week -> parse_monday -> parse_more_food
#     (<-> parse_skip_to_end_of_div)
#   -> parse_open_hours_begin -> parse_open_hours_end -> parse_to_eof
my ($parse_func, $week, $open_hours, $day_id, $cur_title);
my ($cur_food, @cur_day_foods, @week_foods);

# True when a <br> ends the current food instead of adding a line break to it.
# Set per page in parse_juvenes().
my $br_is_new_food;

# Terminal state: ignore every remaining token.
sub parse_to_eof {
}

# Skip tokens until the next </div>, then resume collecting foods.
sub parse_skip_to_end_of_div {
    my $token = shift;

    if ($token->[0] eq 'E' && $token->[1] eq 'div') {
        $parse_func = \&parse_more_food;
    }
}

# Wait for a text token starting with "Aukiolo"; the tokens after it are
# collected as open hours.
sub parse_open_hours_begin {
    my $token = shift;

    return if $token->[0] ne 'T';

    if ($token->[1] =~ /^Aukiolo/) {
        $parse_func = \&parse_open_hours_end;
    }
}

# Append each non-empty text token to $open_hours (one per line) until the
# text "Erityisruokavaliot" is seen, which ends parsing of the page.
sub parse_open_hours_end {
    my $token = shift;

    return if $token->[0] ne 'T';

    my $text = $token->[1];
    if ($text eq 'Erityisruokavaliot') {
        $parse_func = \&parse_to_eof;
        return;
    }

    $text =~ s/\n//g;
    $text =~ s/ +$//;
    $open_hours .= "$text\n" if ($text ne "");
}

# Close the food being accumulated in $cur_food: store it in the current
# day's list unless it is empty or a filtered-out item, then reset it.
sub finish_food {
    chomp $cur_food;

    # Café Pinni lists this item every day, so it is left out.
    my $is_daily_panini = $cur_title eq $pinni_title
        && $cur_food =~ /Liha paniini.*tai Kasvis paniini/i;

    if (!$is_daily_panini && $cur_food ne "") {
        push @cur_day_foods, $cur_food;
    }
    $cur_food = "";
}

# Close the current day: store a copy of its food list in @week_foods and
# move on to the next day index.
sub finish_day {
    push @week_foods, [@cur_day_foods];
    @cur_day_foods = ();
    $day_id++;
}

# Main food-collecting state.  Start tags delimit foods and sections; text
# tokens are either the next day's name, a food separator, or food text.
sub parse_more_food {
    my $token = shift;
    my $type  = $token->[0];

    if ($type eq 'S') {
        my $tag   = $token->[1];
        my $attrs = $token->[2];

        if ($tag eq 'div') {
            # Attributes may be absent; treat a missing one as "no match".
            my $style = $attrs->{'style'};
            my $class = $attrs->{'class'};

            if (defined $style && $style =~ /display: *none/) {
                # infobox, skip
                $parse_func = \&parse_skip_to_end_of_div;
            } elsif (defined $class && $class eq 'Column') {
                # end of food
                finish_food();
                finish_day();
                $parse_func = \&parse_open_hours_begin;
            }
        } elsif ($tag eq 'br') {
            if ($br_is_new_food) {
                finish_food();
            } elsif ($cur_food ne "" && substr($cur_food, -1) ne "\n") {
                # Keep the line break, but never at the start of a food and
                # never twice in a row.
                $cur_food .= "\n";
            }
        }
    } elsif ($type eq 'T') {
        my $text = $token->[1];

        if ($day_id < 6 && $text eq $day_names[$day_id+1]) {
            # day changed
            finish_food();
            finish_day();
        } elsif ($text eq " ") {
            # next food
            finish_food();
        } else {
            # Normalise whitespace, drop leading dots and shorten
            # "sisältää X" to "sis.X" before appending to the current food.
            $text =~ tr/\r\n\t/ /;
            $text =~ s/ +/ /g;
            $text =~ s/^ +//;
            $text =~ s/^\.+//;
            $text =~ s/ +$//;
            $text =~ s/sisältää ([^, \)]+)/sis.$1/ig;
            $cur_food .= $text;
        }
    }
}

# Wait for the first day heading.  If $day_names[0] is missing and
# $day_names[1] comes first, an empty day is recorded for index 0.
sub parse_monday {
    my $token = shift;

    return if $token->[0] ne 'T';

    my $text = $token->[1];
    if ($text eq $day_names[0]) {
        $parse_func = \&parse_more_food;
    } elsif ($text eq $day_names[1]) {
        finish_day();
        $parse_func = \&parse_more_food;
    }
}

# Initial state: wait for a text token containing "Viikko: <number>" and
# remember the number in $week.
sub parse_week {
    my $token = shift;

    return if $token->[0] ne 'T';

    if ($token->[1] =~ /Viikko: (\d+)/) {
        $week = $1;
        $parse_func = \&parse_monday;
    }
}

# Parse one downloaded menu page and append the result to @restaurants.
#
#   $fname    - path of the HTML file to parse
#   $info_ref - reference to the restaurant's info array (title first)
#
# Dies if the file cannot be opened by HTML::TokeParser.
sub parse_juvenes {
    my ($fname, $info_ref) = @_;

    # Load the parser here so this file works even when the caller has not
    # loaded it; a no-op when the module is already loaded.
    require HTML::TokeParser;

    my $p = HTML::TokeParser->new($fname) or die("Can't open file $fname");
    my $title = $info_ref->[0];

    # Reset all per-page parser state.
    $week           = "";
    $open_hours     = "";
    $day_id         = 0;
    $cur_food       = "";
    @cur_day_foods  = ();
    @week_foods     = ();
    $br_is_new_food = $title eq $zip_salaattibaari_title;
    $cur_title      = $title;
    $parse_func     = \&parse_week;

    while (my $token = $p->get_token) {
        $parse_func->($token);
    }

    push @restaurants, [ $title, $open_hours, $week, [ @week_foods ], $info_ref ];
}

# Return true when the restaurants titled $bio_title and $kliininen_title
# have identical food lists for day index $day (also true when neither has
# been parsed, since both lists are then empty).
sub can_merge_bio_kliininen {
    my $day = shift;

    my %foods_of = ($bio_title => "", $kliininen_title => "");

    foreach my $r (@restaurants) {
        my ($r_title, $r_week_foods) = @{$r}[0, 3];
        next unless exists $foods_of{$r_title};

        # A day that was never parsed contributes nothing.
        my $day_foods = $r_week_foods->[$day] || [];
        $foods_of{$r_title} .= join("", map { "$_\n" } @{$day_foods});
    }

    return $foods_of{$bio_title} eq $foods_of{$kliininen_title};
}

# Merge the two restaurants' entries for one day when their menus match.
#
#   $title_ref - reference to the title being displayed; when it is
#                $bio_title and the menus match, " + Kliininen" is appended
#   $day       - day index
#
# Returns 1 when the title is $kliininen_title and the menus match,
# 0 otherwise.
sub try_merge_bio_kliininen {
    my ($title_ref, $day) = @_;
    my $title = $$title_ref;

    if ($title eq $bio_title && can_merge_bio_kliininen($day)) {
        $$title_ref .= " + Kliininen";
    } elsif ($title eq $kliininen_title && can_merge_bio_kliininen($day)) {
        return 1;
    }
    return 0;
}

# Download (unless cached) and parse every page in @restaurant_info.
#
#   $use_old - when true, an existing juvenes<N>.temp.html file is reused
#              instead of being downloaded again
#
# Returns the list of parsed restaurants (see @restaurants for the layout).
# Pages that could not be downloaded and have no cached file are skipped.
sub get_juvenes_restaurants {
    my $use_old = shift;

    # Start from a clean slate so repeated calls do not return duplicates.
    @restaurants = ();

    foreach my $count (0 .. $#restaurant_info) {
        my @info       = @{ $restaurant_info[$count] };
        my $temp_fname = "juvenes$count.temp.html";
        my $url        = $info[1];

        unless ($use_old && -f $temp_fname) {
            # Download to a scratch file first so a failed download never
            # clobbers a good cached copy.  List-form system avoids the shell.
            my $partial = "$temp_fname.tmp";
            if (system('wget', '-q', '--timeout=10', '-O', $partial, $url) == 0) {
                rename($partial, $temp_fname)
                    or warn "Can't rename $partial to $temp_fname: $!\n";
            }
        }

        if (-f $temp_fname) {
            parse_juvenes($temp_fname, \@info);
        }
    }
    return @restaurants;
}

1;