def reparse_ingredients(session):
    """Parse stored ingredients that have no parsed-parts row yet.

    Looks up every ``db.RecipeIngredient`` whose ``id`` does not appear as a
    ``db.RecipeIngredientParts.id``, runs its ``text`` through
    ``parse_ingredient`` and adds a ``db.RecipeIngredientParts`` row (sharing
    the ingredient's ``id``) to *session* for each one that parses.

    Args:
        session: SQLAlchemy session used for the lookup and for the inserts.
            Rows are only added here, never committed; committing is left
            to the caller.

    Ingredients for which ``parse_ingredient`` returns a falsy result are
    skipped.
    """
    # IDs present in RecipeIngredient but absent from RecipeIngredientParts.
    # The compound SELECT is passed to in_() directly: wrapping it in
    # .alias() first forces SQLAlchemy 1.4+ to coerce the subquery back
    # into a select and emit a coercion warning.
    missing_ids = except_(select(db.RecipeIngredient.id),
                          select(db.RecipeIngredientParts.id))
    missing = (session.query(db.RecipeIngredient)
               .where(db.RecipeIngredient.id.in_(missing_ids))
               .all())

    for ingredient in missing:
        parts = parse_ingredient(ingredient.text)
        if not parts:
            continue
        quantity, unit, name, supplement = parts
        session.add(db.RecipeIngredientParts(id=ingredient.id,
                                             quantity=quantity,
                                             unit=unit,
                                             ingredient=name,
                                             supplement=supplement))
session.add(ingred_parts) logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB") - + return recipe