From e207c359ed12f6ef8ac90d943e6ad27d6e8a3109 Mon Sep 17 00:00:00 2001 From: Andrei Stoica Date: Thu, 18 May 2023 19:05:34 -0400 Subject: [PATCH] test parse_recipe_name --- src/recipe_graph/scrape.py | 71 ++++++++++++++++++++++++-------------- test/test_scrape.py | 57 +++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 27 deletions(-) diff --git a/src/recipe_graph/scrape.py b/src/recipe_graph/scrape.py index f234408..a3d3dde 100644 --- a/src/recipe_graph/scrape.py +++ b/src/recipe_graph/scrape.py @@ -9,6 +9,7 @@ from urllib.parse import urljoin import logging from argparse import ArgumentParser + def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern: number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)" ingredient_regex = "([a-zA-Z '\-]+)" @@ -89,7 +90,7 @@ def reparse_ingredients(session): ) -def load_page(recipe_url): +def load_page(recipe_url: str) -> bs4.BeautifulSoup: try: logging.info(f"Loading Page: {recipe_url}") with req.get(recipe_url) as resp: @@ -102,43 +103,61 @@ def load_page(recipe_url): logging.warning(e) +def parse_recipe_name( + site: db.RecipeSite, + page: bs4.BeautifulSoup, + recipe: db.Recipe, + url: str = None, +) -> db.Recipe: + if not url: + url = {"site": site, "recipe": recipe} + name_candidates = page.find_all(class_=site.name_class) + if len(name_candidates) == 0: + raise Exception(f"Could not extract recipe name: {url}") + name_div = name_candidates[0] + recipe.name = name_div.text + + logging.info(f"Adding Recipe {recipe.name} from {url}") + + return recipe + +def parse_ingredient( + ingredient: db.Ingredient +) -> db.RecipeIngredientParts: + parts = parse_ingredient(ingredient.text) + if parts: + quantity, unit, instruction, ingredient_name, supplement = parts + return db.RecipeIngredientParts( + id=ingredient.id, + quantity=quantity, + unit=unit, + instruction=instruction, + ingredient=ingredient_name, + supplement=supplement, + ) + + def parse_recipe(session, recipe, 
site): recipe_url = urljoin(site.base_url, str(recipe.identifier)) recipe_page = load_page(recipe_url) if not recipe_page: return None - name_candidates = recipe_page.find_all(class_=site.name_class) - if len(name_candidates) == 0: - raise Exception(f"Could not extract recipe name: {recipe_url}") - name_div = name_candidates[0] - recipe.name = name_div.text - - logging.info(f"Adding Recipe {recipe.name} from {recipe_url}") - + recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url) session.add(recipe) session.flush() - ingred_candidates = recipe_page.find_all(class_=site.ingredient_class) - for candidate in ingred_candidates: - ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id) - session.add(ingred) + candidates = recipe_page.find_all(class_=site.ingredient_class) + for candidate in candidates: + ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id) + session.add(ingredient) session.flush() - - parts = parse_ingredient(ingred.text) + + parts = parse_ingredient(ingredient) if parts: - quantity, unit, instruction, ingredient, supplement = parts - ingred_parts = db.RecipeIngredientParts( - id=ingred.id, - quantity=quantity, - unit=unit, - instruction=instruction, - ingredient=ingredient, - supplement=supplement, - ) - session.add(ingred_parts) + session.add(parts) - logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB") + logging.info(f"{len(candidates)} ingredients found. 
Inserting into DB") return recipe diff --git a/test/test_scrape.py b/test/test_scrape.py index 4a7da5d..565b451 100644 --- a/test/test_scrape.py +++ b/test/test_scrape.py @@ -1,7 +1,47 @@ from recipe_graph import scrape from bs4 import BeautifulSoup +from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts -import pytest +from pytest import fixture + + +@fixture +def mock_site(): + return RecipeSite( + name="mock-site", + ingredient_class="mock-ing", + name_class="mock-name", + base_url="example-site/mock-site", + ) + + +# TODO: should probably load HTML from file +@fixture +def mock_page(): + return BeautifulSoup( + """ +
+        <div>
+            <div class="mock-name">test_recipe</div>
+            <div class="mock-ing">test_ingredient</div>
+        </div>
+        """,
""", "html.parser") + + +@fixture +def mock_recipe(): + return Recipe(name="test_recipe", identifier="mock_1") + + +@fixture +def mock_url(): + return "example-site/mock-site" def test_load_page(): @@ -23,3 +63,18 @@ def test_ingredient_regex(): regex.pattern == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)" ) + + +def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url): + expected_name = mock_recipe.name + mock_recipe.name = None + + mock_recipe = scrape.parse_recipe_name( + mock_site, + mock_page, + mock_recipe, + mock_url, + ) + assert mock_recipe.name == expected_name + # assert False +