test parse_recipe_name
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Andrei Stoica 2023-05-18 19:05:34 -04:00
parent 35fadd6638
commit e207c359ed
2 changed files with 101 additions and 27 deletions

View File

@ -9,6 +9,7 @@ from urllib.parse import urljoin
import logging import logging
from argparse import ArgumentParser from argparse import ArgumentParser
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern: def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)" number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
ingredient_regex = "([a-zA-Z '\-]+)" ingredient_regex = "([a-zA-Z '\-]+)"
@ -89,7 +90,7 @@ def reparse_ingredients(session):
) )
def load_page(recipe_url): def load_page(recipe_url: str) -> bs4.BeautifulSoup:
try: try:
logging.info(f"Loading Page: {recipe_url}") logging.info(f"Loading Page: {recipe_url}")
with req.get(recipe_url) as resp: with req.get(recipe_url) as resp:
@ -102,43 +103,61 @@ def load_page(recipe_url):
logging.warning(e) logging.warning(e)
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: str = None,
) -> db.Recipe:
    """Fill in ``recipe.name`` from *page* and return the recipe.

    The text of the first element carrying ``site.name_class`` becomes the
    recipe name.  Raises ``Exception`` when no such element is present.

    NOTE(review): when *url* is omitted it is replaced by a dict of the
    site and recipe objects, which then appears verbatim in the log and
    error strings — presumably a deliberate debugging aid; confirm.
    """
    if not url:
        url = {"site": site, "recipe": recipe}
    candidates = page.find_all(class_=site.name_class)
    if not candidates:
        raise Exception(f"Could not extract recipe name: {url}")
    recipe.name = candidates[0].text
    logging.info(f"Adding Recipe {recipe.name} from {url}")
    return recipe
def parse_ingredient(
    ingredient: db.Ingredient
) -> db.RecipeIngredientParts:
    """Split a stored ingredient row into structured parts.

    Returns a ``db.RecipeIngredientParts`` keyed by the ingredient's id,
    or ``None`` (implicitly) when the text cannot be parsed.
    """
    # BUG(review): this calls *itself* with ``ingredient.text`` (a str);
    # the recursive call will then try to read ``.text`` off that string
    # and fail.  The inner call was presumably meant to hit the older
    # text-based parser this wrapper replaced — restore or rename that
    # helper before shipping.
    parts = parse_ingredient(ingredient.text)
    if parts:
        quantity, unit, instruction, ingredient_name, supplement = parts
        return db.RecipeIngredientParts(
            id=ingredient.id,
            quantity=quantity,
            unit=unit,
            instruction=instruction,
            ingredient=ingredient_name,
            supplement=supplement,
        )
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and persist the recipe plus its ingredients.

    Loads the page at ``site.base_url`` joined with the recipe identifier,
    extracts the recipe name and every element carrying
    ``site.ingredient_class``, and inserts the resulting rows through
    *session*.  Returns the populated recipe, or ``None`` when the page
    could not be loaded.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_page(recipe_url)
    if not recipe_page:
        return None
    recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url)
    session.add(recipe)
    session.flush()  # assign recipe.id before the ingredient rows reference it
    candidates = recipe_page.find_all(class_=site.ingredient_class)
    for candidate in candidates:
        ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
        session.add(ingredient)
        session.flush()  # assign ingredient.id for the parts row below
        parts = parse_ingredient(ingredient)
        if parts:
            session.add(parts)
    logging.info(f"{len(candidates)} ingredients found. Inserting into DB")
    return recipe

View File

@ -1,7 +1,47 @@
from recipe_graph import scrape from recipe_graph import scrape
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
import pytest from pytest import fixture
@fixture
def mock_site():
    """A RecipeSite wired to the CSS classes used by the mock pages."""
    site = RecipeSite(
        name="mock-site",
        ingredient_class="mock-ing",
        name_class="mock-name",
        base_url="example-site/mock-site",
    )
    return site
# TODO: should probably load HTML from file
@fixture
def mock_page():
    """Parsed HTML containing one recipe-name div and one ingredient div."""
    return BeautifulSoup(
        """
        <header></header><body>
        <div class="mock-name">test_recipe</div>
        <div class="mock-ing">test_ingredient</div>
        </body>
        """,
        "html.parser",
    )
@fixture
def mock_blank_page():
    """A parsed page with no recipe-name or ingredient elements at all."""
    soup = BeautifulSoup(""" <header></header><body> </body> """, "html.parser")
    return soup
@fixture
def mock_recipe():
    """A Recipe whose name matches the one embedded in mock_page."""
    recipe = Recipe(name="test_recipe", identifier="mock_1")
    return recipe
@fixture
def mock_url():
    """The URL a caller would pass alongside mock_site."""
    return "example-site/mock-site"
def test_load_page(): def test_load_page():
@ -23,3 +63,18 @@ def test_ingredient_regex():
regex.pattern regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)" == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
) )
def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url):
    """parse_recipe_name should recover the name embedded in the page HTML."""
    expected_name = mock_recipe.name
    mock_recipe.name = None  # wipe it so the value must come from the page
    mock_recipe = scrape.parse_recipe_name(
        mock_site,
        mock_page,
        mock_recipe,
        mock_url,
    )
    assert mock_recipe.name == expected_name