"""Scrape the weekly lunch menu from the Campusravita web page."""

import pnalib
import html.parser
import re

url = "http://www.campusravita.fi/ruokalista"

restaurant_info = [
    ["(TAMK) Campus Food", url, "", "middle"],
]

# Weekday names looked for in the date header, paired with the index used as
# the key in ``week_foods``.  Checked in this order; the first hit wins.
_WEEKDAY_INDEX = (
    ("Maanantai", 0),
    ("Tiistai", 1),
    ("Keskiviikko", 2),
    ("Torstai", 3),
    ("Perjantai", 4),
    ("Lauantai", 5),
    ("Sunnuntai", 6),
)


class Tracker:
    """Track whether the parser is currently inside a particular element.

    A tracker watches one tag name.  It becomes active on the first start tag
    of that name whose attributes satisfy ``attr_match`` and stays active
    until the matching end tag, counting nested elements of the same tag name
    so that inner end tags do not close it early.

    Args:
        tag: Tag name to watch.
        attr_match: ``None`` to accept any start tag, or a pair
            ``(attribute_name, compiled_pattern)``; the tag is accepted when
            some attribute has that name and a value the pattern matches
            (``pattern.match``, i.e. anchored at the start of the value).
        on_started: Optional callable invoked when the tracker becomes active.
        on_ended: Optional callable invoked when the tracker becomes inactive.
        on_data: Optional callable invoked with each text chunk seen while
            the tracker is active.
    """

    def __init__(self, tag, attr_match=None, on_started=None, on_ended=None,
                 on_data=None):
        self.tag = tag
        self.attr_match = attr_match
        self.on_started = on_started
        self.on_ended = on_ended
        self.on_data = on_data
        # 0 while inactive; otherwise the depth of open elements named `tag`
        # counted from (and including) the element that activated us.
        self.nesting = 0

    def _attrs_match(self, attrs):
        """Return True if ``attrs`` satisfy this tracker's attribute filter."""
        if self.attr_match is None:
            return True
        wanted_name, pattern = self.attr_match
        # HTMLParser reports valueless attributes (``<div class>``) with a
        # value of None, which cannot be fed to ``pattern.match``.
        return any(
            name == wanted_name and value is not None and pattern.match(value)
            for name, value in attrs
        )

    def handle_starttag(self, tag, attrs):
        """Process a start tag, activating or deepening the tracker."""
        if tag != self.tag:
            return
        if self.nesting:
            # Already inside: just remember the extra level of nesting.
            self.nesting += 1
            return
        if self._attrs_match(attrs):
            self.nesting = 1
            if self.on_started:
                self.on_started()

    def handle_endtag(self, tag):
        """Process an end tag, deactivating the tracker at depth zero."""
        if not self.nesting or tag != self.tag:
            return
        self.nesting -= 1
        if self.nesting == 0 and self.on_ended:
            self.on_ended()

    def handle_data(self, data):
        """Forward text to ``on_data`` while the tracker is active."""
        if self.nesting and self.on_data:
            self.on_data(data)

    def __bool__(self):
        """Return True while the parser is inside the tracked element."""
        return self.nesting > 0


class CampusravitaHTMLParser(html.parser.HTMLParser):
    """HTML parser that collects the week number and the meals per weekday.

    After ``feed()``:

    * ``week`` is the integer week number taken from a text chunk matching
      ``week_re``, or ``None`` if no such text was seen.
    * ``week_foods`` maps a weekday index (see ``_WEEKDAY_INDEX``) to a list
      of strings of the form ``"<food> (<allergy>, <allergy>, ...)"``.
    """

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    week_re = re.compile(r"Ruokalista - Viikko (\d+)")
    lunch_re = re.compile("Lounas|Deli-lounas")
    week = None

    def __init__(self):
        super().__init__()
        self._trackers = []
        # NOTE: trackers are notified in registration order, so the order of
        # the calls below determines the order in which callbacks fire.
        self.in_h3 = self._register_tracker("h3", on_data=self.handle_h3)
        # Everything is inside the menu section
        self.in_menu = self._register_tracker(
            "section", ("id", "block-system-main"),
            on_started=self.handle_menu_start,
            on_ended=self.handle_menu_end)
        # Date comes after menu
        self.in_date_display = self._register_tracker(
            "span", ("class", "date-display-single"),
            on_data=self.handle_date_display)
        # Lunch element contains one meal
        self.in_lunch = self._register_tracker(
            "div", ("about", r"/fi/field-collection/field-ruoka-annos/\d+"),
            on_started=self.handle_lunch_start,
            on_ended=self.handle_lunch_end)
        # Next element contains food name
        self.in_lunch_food = self._register_tracker(
            "div", ("class", ".*field-name-field-nimi.*"),
            on_data=self.handle_lunch_food)
        # Next element contains food allergies
        self.in_allergy = self._register_tracker(
            "div", ("class", ".*field-name-field-ruokavaliot.*"),
            on_started=self.handle_allergy_start,
            on_ended=self.handle_allergy_end)
        # Next element contains allergy short name
        self.in_allergy_short = self._register_tracker(
            "div", ("class", ".*field-name-title field-type-ds.*"),
            on_data=self.handle_allergy)
        # Next element contains lunch price (tracked, but no callback reads it)
        self.in_lunch_price = self._register_tracker(
            "div", ("class", ".*field-name-field-annoksen-hinta.*"))

        # True when the most recent <h3> text inside the menu matched lunch_re.
        self.lunch_type_match = False
        # Meal currently being collected: {"food": [...], "allergies": [...]}.
        self.lunch = None
        # List receiving meals for the most recently seen weekday header;
        # None until the first recognised date header.
        self.current_day = None
        self.week_foods = {}

    def _register_tracker(self, tag, attr_match=None, **kwargs):
        """Create a Tracker, add it to the dispatch list and return it.

        ``attr_match`` is ``None`` or ``(attribute_name, pattern_string)``;
        the pattern string is compiled before being handed to ``Tracker``.
        Remaining keyword arguments are passed through to ``Tracker``.
        """
        compiled = None
        if attr_match:
            compiled = (attr_match[0], re.compile(attr_match[1]))
        tracker = Tracker(tag, compiled, **kwargs)
        self._trackers.append(tracker)
        return tracker

    def handle_date_display(self, data):
        """Start a new day's meal list when ``data`` names a weekday."""
        for day_name, index in _WEEKDAY_INDEX:
            if day_name in data:
                self.current_day = []
                self.week_foods[index] = self.current_day
                return

    def handle_h3(self, data):
        """Record whether an <h3> heading inside the menu is a lunch heading."""
        if self.in_menu:
            self.lunch_type_match = bool(self.lunch_re.match(data))

    def handle_menu_start(self):
        """Hook called when the menu section opens; intentionally empty."""
        pass

    def handle_menu_end(self):
        """Hook called when the menu section closes; intentionally empty."""
        pass

    def handle_allergy(self, data):
        """Add a non-blank allergy short name to the meal being collected."""
        data = data.strip()
        if self.in_allergy and self.in_allergy_short and self.lunch and data:
            self.lunch["allergies"].append(data)

    def handle_allergy_start(self):
        """Hook called when an allergy element opens; intentionally empty."""
        pass

    def handle_allergy_end(self):
        """Hook called when an allergy element closes; intentionally empty."""
        pass

    def handle_lunch_food(self, data):
        """Add a non-blank food name chunk to the meal being collected."""
        data = data.strip()
        if self.lunch and data:
            self.lunch["food"].append(data)

    def handle_lunch_start(self):
        """Begin collecting a meal if the current heading is a lunch heading."""
        if self.lunch_type_match:
            self.lunch = {"food": [], "allergies": []}

    def handle_lunch_end(self):
        """Finish the meal being collected and file it under the current day.

        A meal for which no food name text was captured, or which appears
        before any recognised weekday header, cannot be formatted or
        attributed to a day and is dropped.
        """
        lunch, self.lunch = self.lunch, None
        if not lunch:
            return
        if not lunch["food"] or self.current_day is None:
            return
        # Only the first captured food text chunk is used as the meal name.
        menu = "{menu} ({allergies})".format(
            menu=lunch["food"][0],
            allergies=", ".join(lunch["allergies"]))
        self.current_day.append(menu)

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to every registered tracker."""
        for tracker in self._trackers:
            tracker.handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """Dispatch an end tag to every registered tracker."""
        for tracker in self._trackers:
            tracker.handle_endtag(tag)

    def handle_data(self, data):
        """Dispatch text to every tracker and look for the week number."""
        for tracker in self._trackers:
            tracker.handle_data(data)
        week_match = self.week_re.match(data)
        if week_match:
            self.week = int(week_match.group(1))


def get_restaurants(use_old, week):
    """Fetch and parse the menu page and return the restaurant entries.

    Args:
        use_old: Passed through to ``pnalib.get_file``.
            NOTE(review): presumably selects a cached copy of the page;
            confirm against pnalib.
        week: Not used by this function; the week number is read from the
            page itself.

    Returns:
        A list that is empty when no week number was found on the page, and
        otherwise holds one entry of the form
        ``[name, "", week_number, week_foods, restaurant_info[0]]``.
    """
    data = pnalib.get_file(url, "campusravita.html", use_old)
    parser = CampusravitaHTMLParser()
    parser.feed(data)

    restaurants = []
    if parser.week is not None:
        info = restaurant_info[0]
        restaurants.append([info[0], "", parser.week, parser.week_foods, info])
    return restaurants