diff --git a/src/recipe_graph/scrape.py b/src/recipe_graph/scrape.py
index f234408..a3d3dde 100644
--- a/src/recipe_graph/scrape.py
+++ b/src/recipe_graph/scrape.py
@@ -9,6 +9,7 @@ from urllib.parse import urljoin
import logging
from argparse import ArgumentParser
+
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
ingredient_regex = "([a-zA-Z '\-]+)"
@@ -89,7 +90,7 @@ def reparse_ingredients(session):
)
-def load_page(recipe_url):
+def load_page(recipe_url: str) -> bs4.BeautifulSoup:
try:
logging.info(f"Loading Page: {recipe_url}")
with req.get(recipe_url) as resp:
@@ -102,43 +103,61 @@ def load_page(recipe_url):
logging.warning(e)
+def parse_recipe_name(
+    site: db.RecipeSite,
+    page: bs4.BeautifulSoup,
+    recipe: db.Recipe,
+    url: str = None,
+) -> db.Recipe:
+    if not url:
+        url = urljoin(site.base_url, str(recipe.identifier))
+    name_candidates = page.find_all(class_=site.name_class)
+    if len(name_candidates) == 0:
+        raise Exception(f"Could not extract recipe name: {url}")
+    name_div = name_candidates[0]
+    recipe.name = name_div.text
+
+    logging.info(f"Adding Recipe {recipe.name} from {url}")
+
+    return recipe
+
+def parse_ingredient_parts(
+    ingredient: db.RecipeIngredient
+) -> db.RecipeIngredientParts:
+    parts = parse_ingredient(ingredient.text)
+    if parts:
+        quantity, unit, instruction, ingredient_name, supplement = parts
+        return db.RecipeIngredientParts(
+            id=ingredient.id,
+            quantity=quantity,
+            unit=unit,
+            instruction=instruction,
+            ingredient=ingredient_name,
+            supplement=supplement,
+        )
+
+
def parse_recipe(session, recipe, site):
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_page(recipe_url)
    if not recipe_page:
        return None
-    name_candidates = recipe_page.find_all(class_=site.name_class)
-    if len(name_candidates) == 0:
-        raise Exception(f"Could not extract recipe name: {recipe_url}")
-    name_div = name_candidates[0]
-    recipe.name = name_div.text
-
-    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
-
+    recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url)
    session.add(recipe)
    session.flush()
-    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
-    for candidate in ingred_candidates:
-        ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
-        session.add(ingred)
+    candidates = recipe_page.find_all(class_=site.ingredient_class)
+    for candidate in candidates:
+        ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
+        session.add(ingredient)
        session.flush()
-
-        parts = parse_ingredient(ingred.text)
+
+        parts = parse_ingredient_parts(ingredient)
        if parts:
-            quantity, unit, instruction, ingredient, supplement = parts
-            ingred_parts = db.RecipeIngredientParts(
-                id=ingred.id,
-                quantity=quantity,
-                unit=unit,
-                instruction=instruction,
-                ingredient=ingredient,
-                supplement=supplement,
-            )
-            session.add(ingred_parts)
+            session.add(parts)
-    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
+    logging.info(f"{len(candidates)} ingredients found. Inserting into DB")
    return recipe
diff --git a/test/test_scrape.py b/test/test_scrape.py
index 4a7da5d..565b451 100644
--- a/test/test_scrape.py
+++ b/test/test_scrape.py
@@ -1,7 +1,47 @@
from recipe_graph import scrape
from bs4 import BeautifulSoup
+from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
-import pytest
+from pytest import fixture
+
+
+@fixture
+def mock_site():
+    return RecipeSite(
+        base_url="example-site/mock-site",
+        name="mock-site",
+        name_class="mock-name",
+        ingredient_class="mock-ing",
+    )
+
+
+# TODO: should probably load HTML from file
+@fixture
+def mock_page():
+ return BeautifulSoup(
+ """
+