From 259c08fd4e258d47b09c9b6d3dd9dce4fa5eef1b Mon Sep 17 00:00:00 2001
From: Andrei Stoica
Date: Thu, 18 May 2023 16:02:14 -0400
Subject: [PATCH] added test for creating regex

---
 src/recipe_graph/scrape.py | 212 ++++++++++++++++++++++----------------
 test/test_scrape.py        |  23 +++-
 2 files changed, 144 insertions(+), 91 deletions(-)

diff --git a/src/recipe_graph/scrape.py b/src/recipe_graph/scrape.py
index 9a9b6b6..f234408 100644
--- a/src/recipe_graph/scrape.py
+++ b/src/recipe_graph/scrape.py
@@ -9,41 +9,56 @@ from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser
 
-def parse_ingredient(ingredient_text):
-    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
-             'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
-             'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
-             'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
-             'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
-             'year', 'fillet', 'litter', 'packet', 'slices']
-    instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
-                    'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
-                    'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
-                    'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
-                    'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
-                    'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
-                    'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
-                    'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
-                    'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
-                    'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
-                    'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
-                    'unopened', 'unseasoned']
-    number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
-    ingredient_regex = '([a-zA-Z \'\-]+)'
-    supplement_regex = ',?(.*)'
-    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
-                            for unit in units])
+def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
+    number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
+    ingredient_regex = "([a-zA-Z '\-]+)"
+    supplement_regex = ",?(.*)"
+    units_regex = "|".join(
+        [f"[{unit[0]}{unit[0].capitalize()}]{unit[1:]}" for unit in units]
+    )
     units_regex = f"((?:(?:{units_regex})e?s?)?)"
-    instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}'
-                                   for inst in instructions])
+    instructions_regex = "|".join(
+        [f"[{inst[0]}{inst[0].capitalize()}]{inst[1:]}" for inst in instructions]
+    )
     instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"
-
-    regex = re.compile(number_regex +
-                       units_regex +
-                       instructions_regex +
-                       ingredient_regex +
-                       supplement_regex)
-
+
+    return re.compile(
+        number_regex
+        + units_regex
+        + instructions_regex
+        + ingredient_regex
+        + supplement_regex
+    )
+
+# TODO: load units and instructions from config.
+# Moved data into optional parameters for the time being.
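+# Example (illustrative): with the default lists below,
+#   parse_ingredient("2 cups chopped fresh basil, divided")
+# is expected to return ["2", "cups", "chopped fresh", "basil", "divided"].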
+def parse_ingredient(
+    ingredient_text: str,
+    units: list[str] = [ "teaspoon", "tablespoon", "gram", "ounce", "jar",
+                         "cup", "pinch", "container", "slice", "package",
+                         "pound", "can", "dash", "spear", "bunch", "quart",
+                         "cube", "envelope", "square", "sprig", "bag", "box",
+                         "drop", "fluid ounce", "gallon", "head", "link",
+                         "loaf", "pint", "pod", "sheet", "stalk", "whole",
+                         "bar", "bottle", "bulb", "year", "fillet", "liter",
+                         "packet", "slices"],
+    instructions: list[str] = [
+        "and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
+        "medium", "large", "coarse", "cracked", "crushed", "ground", "cooked",
+        "cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
+        "drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
+        "grated", "grilled", "hard", "hot", "julienned?", "leftover", "light",
+        "lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
+        "sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
+        "shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
+        "thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
+        "unopened", "unseasoned"],
+):
+    regex = ingredient_regex(units, instructions)
+
     m = regex.match(ingredient_text)
     logging.info(f"Parsed {ingredient_text}, found: {m}")
     if not m:
@@ -52,34 +67,38 @@ def parse_ingredient(ingredient_text):
     return [text.strip() if text else None for text in m.groups()]
 
-
 def reparse_ingredients(session):
-    cte = (except_(select(db.RecipeIngredient.id),
-                   select(db.RecipeIngredientParts.id))).\
-            alias('missing')
-    missing = session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    cte = (
+        except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
+    ).alias("missing")
+    missing = (
+        session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    )
     for ingredient in missing:
         parts = parse_ingredient(ingredient.text)
         if not parts:
             continue
         quantity, unit, instruction, name, supplement = parts
-        session.add(db.RecipeIngredientParts(id = ingredient.id,
-                                             quantity = quantity,
-                                             unit = unit,
-                                             instruction = instruction,
-                                             ingredient = name,
-                                             supplement = supplement))
-
+        session.add(
+            db.RecipeIngredientParts(
+                id=ingredient.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=name,
+                supplement=supplement,
+            )
+        )
 
 
 def load_page(recipe_url):
-    try:
-        logging.info(f'Loading Page: {recipe_url}')
+    try:
+        logging.info(f"Loading Page: {recipe_url}")
         with req.get(recipe_url) as resp:
             if resp.status_code == 404:
                 raise Exception(f"Page does not exist (404): {recipe_url}")
-            return bs4.BeautifulSoup(resp.text, 'html.parser')
+            return bs4.BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         logging.warning(f"Could not download or parse recipe: {recipe_url}")
 
 
@@ -98,47 +117,58 @@ def parse_recipe(session, recipe, site):
     name_div = name_candidates[0]
     recipe.name = name_div.text
 
-    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
+    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
     session.add(recipe)
     session.flush()
 
     ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
     for candidate in ingred_candidates:
-        ingred = db.RecipeIngredient(text=candidate.text,
-                                     recipe_id=recipe.id)
-        session.add(ingred)
+        ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
+        session.add(ingred)
         session.flush()
-
+
         parts = parse_ingredient(ingred.text)
         if parts:
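+            # m.groups() gives (quantity, unit, instruction, ingredient, supplement)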
-            quantity, unit, instruction,ingredient, supplement = parts
-            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
-                                                    quantity = quantity,
-                                                    unit = unit,
-                                                    instruction = instruction,
-                                                    ingredient = ingredient,
-                                                    supplement = supplement)
+            quantity, unit, instruction, ingredient, supplement = parts
+            ingred_parts = db.RecipeIngredientParts(
+                id=ingred.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=ingredient,
+                supplement=supplement,
+            )
             session.add(ingred_parts)
-
+
     logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
-
+
     return recipe
-
-def main(): # pragma: no cover
+
+
+def main():  # pragma: no cover
     parser = ArgumentParser(description="Scrape a recipe site for recipies")
-    parser.add_argument('site',
-                        help='Name of site')
-    parser.add_argument('-id', '--identifier', dest='id',
-                        help='url of recipe(reletive to base url of site) or commma seperated list')
-    parser.add_argument('-a', '--auto', action='store', dest='n',
-                        help='automaticaly generate identifier(must supply number of recipies to scrape)')
-    parser.add_argument('-v', '--verbose', action='store_true')
+    parser.add_argument("site", help="Name of site")
+    parser.add_argument(
+        "-id",
+        "--identifier",
+        dest="id",
+        help="URL of recipe (relative to base URL of site) or comma-separated list",
+    )
+    parser.add_argument(
+        "-a",
+        "--auto",
+        action="store",
+        dest="n",
+        help="automatically generate identifier (must supply number of recipes to scrape)",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
     args = parser.parse_args(sys.argv)
 
     if args.verbose:
         logging.basicConfig(level=logging.INFO)
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
 
     eng = db.get_engine()
     S = sessionmaker(eng)
@@ -146,32 +176,34 @@ def main(): # pragma: no cover
     with S.begin() as sess:
         site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
         site_id = site.id
-
+
         recipe_ids = []
         starting_id = 0
         if args.id and not args.n:
             recipe_ids.append(args.id)
-            logging.info(f'Retreiving single recipe: {args.id}')
+            logging.info(f"Retrieving single recipe: {args.id}")
        elif args.n:
             if not args.id:
-                last_recipe = sess.query(db.Recipe).\
-                              where(db.Recipe.recipe_site_id == site.id).\
-                              order_by(desc(db.Recipe.identifier)).\
-                              limit(1).\
-                              scalar()
+                last_recipe = (
+                    sess.query(db.Recipe)
+                    .where(db.Recipe.recipe_site_id == site.id)
+                    .order_by(desc(db.Recipe.identifier))
+                    .limit(1)
+                    .scalar()
+                )
                 starting_id = int(last_recipe.identifier) + 1
             else:
                 starting_id = int(args.id)
-            recipe_ids = range(starting_id, starting_id+int(args.n))
-            logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
-
-
-
-        for recipe_id in recipe_ids:
-            try:
+            recipe_ids = range(starting_id, starting_id + int(args.n))
+            logging.info(
+                f"Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}"
+            )
+
+        for recipe_id in recipe_ids:
+            try:
                 savepoint = sess.begin_nested()
-                recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
+                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                 parse_recipe(sess, recipe, site)
 
                 savepoint.commit()
@@ -183,6 +215,6 @@ def main(): # pragma: no cover
             logging.error(e)
             continue
 
-
-if __name__ == "__main__": # pragma: no cover
-    main()
\ No newline at end of file
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/test/test_scrape.py b/test/test_scrape.py
index 751d193..b7add03 100644
--- a/test/test_scrape.py
+++ b/test/test_scrape.py
@@ -9,4 +9,25 @@ def test_load_page():
     assert type(page) == BeautifulSoup
 
     page = scrape.load_page("https://hs.andreistoica.ca:4943/some-nonesense")
-    assert page == None
\ No newline at end of file
+    assert page is None
+
+
+def test_ingredient_regex():
+    regex = scrape.ingredient_regex(["cup"], ["crushed"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )
+    regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )
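+
+
+def test_parse_ingredient_defaults():
+    # Illustrative sketch, not part of the original change: with the default
+    # word lists, parse_ingredient should split an ingredient line into
+    # [quantity, unit, instruction, name, supplement].
+    parts = scrape.parse_ingredient("2 cups chopped fresh basil, divided")
+    assert parts == ["2", "cups", "chopped fresh", "basil", "divided"]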