diff --git a/src/db.py b/src/db.py index 83688b6..df9b4f3 100644 --- a/src/db.py +++ b/src/db.py @@ -48,6 +48,7 @@ class RecipeIngredientParts(Base): id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True) quantity = Column(String) unit = Column(String) + instruction = Column(String) ingredient = Column(String) supplement = Column(String) diff --git a/src/scrape.py b/src/scrape.py index d3dd0e0..06ca61f 100644 --- a/src/scrape.py +++ b/src/scrape.py @@ -12,17 +12,18 @@ import logging from argparse import ArgumentParser def parse_ingredient(ingredient_text): + units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch', 'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear', 'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag', 'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf', 'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb', 'year', 'fillet', 'litter', 'packet', 'slices'] - instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled', 'chunky', - 'small', 'medium', 'large', 'couarse(:?ly)?', 'cracked', + instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled', + 'chunky', 'small', 'medium', 'large', 'couarse', 'cracked', 'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut', 'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned', - 'dried', 'dry', 'fine(?:ly)', 'firmly', 'fresh', 'frozen', + 'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen', 'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover', 'light', 'lite', 'mashed', 'melted', 'minced', 'packed', 'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated', @@ -36,9 +37,13 @@ def parse_ingredient(ingredient_text): units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}' for unit in units]) units_regex = f"((?:(?:{units_regex})e?s?)?)" + instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}' + for inst in instructions]) + instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)" - regex = re.compile(number_regex + + regex = re.compile(number_regex + units_regex + + instructions_regex + ingredient_regex + supplement_regex) @@ -47,7 +52,9 @@ def parse_ingredient(ingredient_text): if not m: return None - return [text.strip() for text in m.groups()] + return [text.strip() if text else None for text in m.groups()] + + def reparse_ingredients(session): cte = (except_(select(db.RecipeIngredient.id), @@ -59,14 +66,16 @@ def reparse_ingredients(session): parts = parse_ingredient(ingredient.text) if not parts: continue - quantity, unit, name, supplement = parts + quantity, unit, instruction, name, supplement = parts session.add(db.RecipeIngredientParts(id = ingredient.id, quantity = quantity, unit = unit, + instruction = instruction, ingredient = name, supplement = supplement)) + def load_recipe(recipe_url): try: logging.info(f'Loading Recipe: {recipe_url}') @@ -107,10 +116,11 @@ def parse_recipe(session, recipe, site): parts = parse_ingredient(ingred.text) if parts: - quantity, unit, ingredient, supplement = parts + quantity, unit, instruction,ingredient, supplement = parts ingred_parts = db.RecipeIngredientParts(id = ingred.id, quantity = quantity, unit = unit, + instruction = instruction, ingredient = ingredient, supplement = supplement) session.add(ingred_parts)