diff --git a/src/db.py b/src/db.py
index 68047a1..83688b6 100644
--- a/src/db.py
+++ b/src/db.py
@@ -1,3 +1,4 @@
+from typing import Text
 from sqlalchemy import create_engine, Column, Integer, String, \
     ForeignKey, UniqueConstraint
 from sqlalchemy.engine import URL
@@ -40,6 +41,16 @@ class RecipeIngredient(Base):
     text = Column(String, nullable = False)
     recipe_id = Column(Integer, ForeignKey('Recipe.id'))
     ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
+
+class RecipeIngredientParts(Base):
+    __tablename__ = 'RecipeIngredientParts'
+
+    id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
+    quantity = Column(String)
+    unit = Column(String)
+    ingredient = Column(String)
+    supplement = Column(String)
+
 
 
 def get_engine(use_dotenv = True, **kargs):
diff --git a/src/scrape.py b/src/scrape.py
index c85357b..d432854 100644
--- a/src/scrape.py
+++ b/src/scrape.py
@@ -1,11 +1,88 @@
 import db
+import re
 from sqlalchemy import select, desc
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
+from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser
 
+def parse_ingredient(ingredient_text):
+    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch']
+    number_regex = r'((?:[\d\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
+    ingredient_regex = r"([a-zA-Z '\-]+)"
+    supplement_regex = r',?(.*)'
+    units_regex = "|".join([f'[{unit[0]}{unit[0].upper()}]{unit[1:]}'
+                            for unit in units])
+    units_regex = f"((?:{units_regex})s?)"
+
+    regex = re.compile(number_regex +
+                       units_regex +
+                       ingredient_regex +
+                       supplement_regex)
+
+    m = regex.match(ingredient_text)
+    logging.info(f"Parsed {ingredient_text}, found: {m}")
+    if not m:
+        return None
+
+    return [text.strip() for text in m.groups()]
+
+def load_recipe(recipe_url):
+    try:
+        logging.info(f'Loading Recipe: {recipe_url}')
+        with urlopen(recipe_url) as f:
+            if f.getcode() == 404:
+                raise Exception(f"Recipe does not exist: {recipe_url}")
+            return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
+
+    except Exception as e:
+        logging.warning(f"Could not download or parse recipe: {recipe_url}")
+        logging.warning(e)
+
+    return None
+
+def parse_recipe(session, recipe, site):
+    recipe_url = urljoin(site.base_url, str(recipe.identifier))
+    recipe_page = load_recipe(recipe_url)
+    if not recipe_page:
+        return None
+
+    name_candidates = recipe_page.find_all(class_=site.name_class)
+    if len(name_candidates) == 0:
+        raise Exception(f"Could not extract recipe name: {recipe_url}")
+    name_div = name_candidates[0]
+    recipe.name = name_div.text
+
+    logging.info(f"Adding Recipe {recipe}")
+
+    session.add(recipe)
+    session.flush()
+
+    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
+    for candidate in ingred_candidates:
+        ingred = db.RecipeIngredient(text=candidate.text,
+                                     recipe_id=recipe.id)
+        session.add(ingred)
+        session.flush()
+
+        parts = parse_ingredient(ingred.text)
+        if parts:
+            quantity, unit, ingredient, supplement = parts
+            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
+                                                    quantity = quantity,
+                                                    unit = unit,
+                                                    ingredient = ingredient,
+                                                    supplement = supplement)
+            session.add(ingred_parts)
+
+    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
Inserting into DB") + + return recipe + + parser = ArgumentParser(description="Scrape a recipe site for recipies") parser.add_argument('site', help='Name of site') @@ -41,44 +118,26 @@ with S.begin() as sess: scalar() starting_id = int(last_recipe.identifier) + 1 else: - starting_id = args.id + starting_id = int(args.id) recipe_ids = range(starting_id, starting_id+int(args.n)) logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}') - + for recipe_id in recipe_ids: - recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id) + try: + savepoint = sess.begin_nested() - recipe_url = f'{site.base_url}/{recipe.identifier}' - logging.info(f'Loading Recipe: {recipe_url}') - try: - with urlopen(recipe_url) as f: - if f.getcode() == 404: - raise Exception(f"Recipe Does not exist: {recipe_url}") - recipe_page = bs4.BeautifulSoup(f.read().decode()) - - - name_candidates = recipe_page.find_all(class_=site.name_class) - if len(name_candidates) == 0: - raise Exception(f"Could not extract recipe name: {recipe_url}") - name_div = name_candidates[0] + recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id) + parse_recipe(sess, recipe, site) + + savepoint.commit() + except KeyboardInterrupt as e: + savepoint.rollback() + break except Exception as e: - logging.warning(f"Could not download or parse recipe: {recipe_url}") - logging.warning(e) - continue - - recipe.name = name_div.text - logging.info(f"Adding Recipe {recipe}") + savepoint.rollback() + logging.error(e) + break - sess.add(recipe) - sess.flush() - - ingredients = [] - ingred_candidates = recipe_page.find_all(class_=site.ingredient_class) - for ingredient in ingred_candidates: - ingredients.append(db.RecipeIngredient(text=ingredient.text, - recipe_id=recipe.id)) - - logging.info(f"{len(ingredients)} ingredients found. Inserting into DB") - sess.add_all(ingredients) \ No newline at end of file + \ No newline at end of file