test parse_recipe_name
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
This commit is contained in:
parent
35fadd6638
commit
e207c359ed
|
|
@ -9,6 +9,7 @@ from urllib.parse import urljoin
|
|||
import logging
|
||||
from argparse import ArgumentParser
|
||||
|
||||
|
||||
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
|
||||
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
|
||||
ingredient_regex = "([a-zA-Z '\-]+)"
|
||||
|
|
@ -89,7 +90,7 @@ def reparse_ingredients(session):
|
|||
)
|
||||
|
||||
|
||||
def load_page(recipe_url):
|
||||
def load_page(recipe_url: str) -> bs4.BeautifulSoup:
|
||||
try:
|
||||
logging.info(f"Loading Page: {recipe_url}")
|
||||
with req.get(recipe_url) as resp:
|
||||
|
|
@ -102,43 +103,61 @@ def load_page(recipe_url):
|
|||
logging.warning(e)
|
||||
|
||||
|
||||
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: str = None,
) -> db.Recipe:
    """Extract the recipe's display name from *page* and store it on *recipe*.

    Args:
        site: Site configuration; ``site.name_class`` is the CSS class of the
            element that holds the recipe name.
        page: Parsed recipe page.
        recipe: Recipe row to populate in place.
        url: Recipe URL, used only in log/error messages. Rebuilt from the
            site base URL and recipe identifier when not supplied.

    Returns:
        The same ``recipe`` object with ``name`` set.

    Raises:
        Exception: If no element with class ``site.name_class`` is found.
    """
    if not url:
        # Bug fix: the fallback previously assigned a dict, which rendered as
        # a Python repr inside the f-strings below. Build the canonical URL
        # the same way parse_recipe does.
        url = urljoin(site.base_url, str(recipe.identifier))
    name_candidates = page.find_all(class_=site.name_class)
    if len(name_candidates) == 0:
        raise Exception(f"Could not extract recipe name: {url}")
    name_div = name_candidates[0]
    recipe.name = name_div.text

    logging.info(f"Adding Recipe {recipe.name} from {url}")

    return recipe
def parse_ingredient(
    ingredient: db.Ingredient
) -> db.RecipeIngredientParts:
    """Split an ingredient row's free text into structured parts.

    Returns a RecipeIngredientParts row sharing the ingredient's id, or
    None when the text cannot be parsed into parts.
    """
    # NOTE(review): this calls parse_ingredient — i.e. itself — passing a
    # string, which has no `.text` attribute on the next call. It likely
    # should call the text-level parser this wrapper was refactored from;
    # confirm the intended helper name.
    parts = parse_ingredient(ingredient.text)
    if not parts:
        return None
    quantity, unit, instruction, ingredient_name, supplement = parts
    return db.RecipeIngredientParts(
        id=ingredient.id,
        quantity=quantity,
        unit=unit,
        instruction=instruction,
        ingredient=ingredient_name,
        supplement=supplement,
    )
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and persist the recipe plus its ingredients.

    Args:
        session: DB session supporting ``add``/``flush`` (rows are staged
            and flushed here, not committed).
        recipe: Recipe row carrying the site-local ``identifier``.
        site: RecipeSite describing the base URL and the CSS classes used
            for names and ingredients.

    Returns:
        The populated recipe row, or None when the page could not be loaded.
    """
    # Defect fixed: the original span interleaved removed and added diff
    # lines (old inline name extraction vs the parse_recipe_name call; old
    # ingred_* loop vs the candidates loop). This is the coherent new side.
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_page(recipe_url)
    if not recipe_page:
        return None

    recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url)
    session.add(recipe)
    # Flush so recipe.id is assigned before ingredients reference it.
    session.flush()

    candidates = recipe_page.find_all(class_=site.ingredient_class)
    for candidate in candidates:
        ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
        session.add(ingredient)
        # Flush so ingredient.id exists for the parts row below.
        session.flush()

        parts = parse_ingredient(ingredient)
        if parts:
            session.add(parts)

    logging.info(f"{len(candidates)} ingredients found. Inserting into DB")

    return recipe
|
|
|
|||
|
|
@ -1,7 +1,47 @@
|
|||
from recipe_graph import scrape
|
||||
from bs4 import BeautifulSoup
|
||||
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
|
||||
|
||||
import pytest
|
||||
from pytest import fixture
|
||||
|
||||
|
||||
@fixture
def mock_site():
    """A RecipeSite configured for the synthetic test pages."""
    site_config = {
        "name": "mock-site",
        "ingredient_class": "mock-ing",
        "name_class": "mock-name",
        "base_url": "example-site/mock-site",
    }
    return RecipeSite(**site_config)
||||
# TODO: should probably load HTML from file
@fixture
def mock_page():
    """A parsed page with one mock recipe name and one mock ingredient."""
    markup = """
        <header></header><body>
        <div class="mock-name">test_recipe</div>
        <div class="mock-ing">test_ingredient</div>
        </body>
        """
    return BeautifulSoup(markup, "html.parser")
||||
@fixture
def mock_blank_page():
    """A parsed page containing no recipe content at all."""
    markup = """ <header></header><body> </body> """
    return BeautifulSoup(markup, "html.parser")
||||
@fixture
def mock_recipe():
    """A Recipe row pre-populated with a known name and identifier."""
    recipe = Recipe(name="test_recipe", identifier="mock_1")
    return recipe
||||
@fixture
def mock_url():
    """The URL of the mock recipe site (matches mock_site.base_url)."""
    url = "example-site/mock-site"
    return url
||||
def test_load_page():
|
||||
|
|
@ -23,3 +63,18 @@ def test_ingredient_regex():
|
|||
regex.pattern
|
||||
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
|
||||
)
|
||||
|
||||
|
||||
def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url):
    """parse_recipe_name should recover a cleared name from the page markup."""
    expected_name = mock_recipe.name
    mock_recipe.name = None

    mock_recipe = scrape.parse_recipe_name(
        mock_site,
        mock_page,
        mock_recipe,
        mock_url,
    )
    # Removed leftover debug line (`# assert False`) — commented-out code.
    assert mock_recipe.name == expected_name
|
|
|||
Loading…
Reference in New Issue