Compare commits

...

2 Commits

Author          SHA1        Message                         Date
Andrei Stoica   259c08fd4e  added test for creating regex   2023-05-18 16:02:14 -04:00
Andrei Stoica   e6d150421f  updated readme                  2023-05-18 11:36:17 -04:00
3 changed files with 137 additions and 102 deletions

View File

@@ -88,9 +88,13 @@ docker-compose -p recipe-test up
 running tests
 ```sh
-pytest
+pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
 ```
+The html report is under `htmlcov/` and can be viewed through any browser.
+The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
+plugin for VS Code to view coverage in your editor.
 **WARNINING**: If you get `ERROR at setup of test_db_connection` and
 `ERROR at setup of test_db_class_creation`, please check if testing database is
 already initiated. Testing is destructive and should be done on a fresh database.
@@ -105,16 +109,6 @@ docker-compose -p recipe-test down
 Test are written in pytest framework. Currently focused on unittest and code
 coverage. Integration tests to come.
-To run test use:
-```sh
-pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
-```
-The html report is under `htmlcov/` and can be viewed through any browser.
-The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
-plugin for VS Code to view coverage in your editor.
 ## TODO
 > ☑ automate scraping\
 > ☑ extracting quantity and name (via regex)\

View File

@@ -9,41 +9,53 @@ from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser

-def parse_ingredient(ingredient_text):
-    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
-             'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
-             'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
-             'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
-             'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
-             'year', 'fillet', 'litter', 'packet', 'slices']
-    instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
-                    'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
-                    'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
-                    'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
-                    'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
-                    'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
-                    'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
-                    'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
-                    'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
-                    'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
-                    'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
-                    'unopened', 'unseasoned']
-    number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
-    ingredient_regex = '([a-zA-Z \'\-]+)'
-    supplement_regex = ',?(.*)'
-    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
-                            for unit in units])
+def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
+    number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
+    ingredient_regex = "([a-zA-Z '\-]+)"
+    supplement_regex = ",?(.*)"
+    units_regex = "|".join(
+        [f"[{unit[0]}{unit[0].capitalize()}]{unit[1:]}" for unit in units]
+    )
     units_regex = f"((?:(?:{units_regex})e?s?)?)"
-    instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}'
-                                   for inst in instructions])
+    instructions_regex = "|".join(
+        [f"[{inst[0]}{inst[0].capitalize()}]{inst[1:]}" for inst in instructions]
+    )
     instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"
-    regex = re.compile(number_regex +
-                       units_regex +
-                       instructions_regex +
-                       ingredient_regex +
-                       supplement_regex)
+    return re.compile(
+        number_regex
+        + units_regex
+        + instructions_regex
+        + ingredient_regex
+        + supplement_regex
+    )
+
+
+# TODO: load units and instructions from config.
+# Moved data into optional parameters for the time being.
+def parse_ingredient(
+    ingredient_text: str,
+    units: list[str] = ["teaspoon", "tablespoon", "gram", "ounce", "jar",
+                        "cup", "pinch", "container", "slice", "package",
+                        "pound", "can", "dash", "spear", "bunch", "quart",
+                        "cube", "envelope", "square", "sprig", "bag", "box",
+                        "drop", "fluid ounce", "gallon", "head", "link",
+                        "loaf", "pint", "pod", "sheet", "stalk", "whole",
+                        "bar", "bottle", "bulb", "year", "fillet", "litter",
+                        "packet", "slices"],
+    instructions: list[str] = [
+        "and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
+        "medium", "large", "couarse", "cracked", "crushed", "ground", "cooked",
+        "cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
+        "drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
+        "grated", "grilled", "hard", "hot", "juliened?", "leftover", "light",
+        "lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
+        "sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
+        "shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
+        "thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
+        "unopened", "unseasoned"],
+):
+    regex = ingredient_regex(units, instructions)
     m = regex.match(ingredient_text)
     logging.info(f"Parsed {ingredient_text}, found: {m}")
     if not m:
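Note: the refactor above extracts the pattern construction into `ingredient_regex` so it can be unit-tested in isolation (see the new test file below). As a quick illustration of what the composed pattern captures, here is a minimal, self-contained sketch; it rebuilds the same five-group pattern for a one-unit, one-instruction vocabulary instead of importing the project, and the sample ingredient string is my own.

```python
import re

# Rebuild the composed pattern for a tiny vocabulary (mirrors ingredient_regex).
units = ["cup"]
instructions = ["crushed"]

number = r"((?:[\d\./\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)"  # digits, fractions, "(...)"
unit_alt = "|".join(f"[{u[0]}{u[0].capitalize()}]{u[1:]}" for u in units)
inst_alt = "|".join(f"[{i[0]}{i[0].capitalize()}]{i[1:]}" for i in instructions)
pattern = re.compile(
    number
    + f"((?:(?:{unit_alt})e?s?)?)"           # optional unit with plural "s"/"es"
    + f"((?:(?:(?:{inst_alt})(?:ly)?)| )*)"  # zero or more preparation words
    + r"([a-zA-Z '\-]+)"                     # ingredient name
    + r",?(.*)"                              # supplement text after a comma
)

m = pattern.match("2 cups crushed tomatoes, drained")
print([g.strip() if g else None for g in m.groups()])
# -> ['2', 'cups', 'crushed', 'tomatoes', 'drained']
```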
@@ -52,34 +64,38 @@
     return [text.strip() if text else None for text in m.groups()]

 def reparse_ingredients(session):
-    cte = (except_(select(db.RecipeIngredient.id),
-                   select(db.RecipeIngredientParts.id))).\
-        alias('missing')
-    missing = session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    cte = (
+        except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
+    ).alias("missing")
+    missing = (
+        session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    )
     for ingredient in missing:
         parts = parse_ingredient(ingredient.text)
         if not parts:
             continue
         quantity, unit, instruction, name, supplement = parts
-        session.add(db.RecipeIngredientParts(id = ingredient.id,
-                                             quantity = quantity,
-                                             unit = unit,
-                                             instruction = instruction,
-                                             ingredient = name,
-                                             supplement = supplement))
+        session.add(
+            db.RecipeIngredientParts(
+                id=ingredient.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=name,
+                supplement=supplement,
+            )
+        )

 def load_page(recipe_url):
     try:
-        logging.info(f'Loading Page: {recipe_url}')
+        logging.info(f"Loading Page: {recipe_url}")
         with req.get(recipe_url) as resp:
             if resp.status_code == 404:
                 raise Exception(f"Page does not exist (404): {recipe_url}")
-            return bs4.BeautifulSoup(resp.text, 'html.parser')
+            return bs4.BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         logging.warning(f"Could not download or parse recipe: {recipe_url}")
@@ -98,47 +114,57 @@ def parse_recipe(session, recipe, site):
     name_div = name_candidates[0]
     recipe.name = name_div.text
     logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
     session.add(recipe)
     session.flush()
     ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
     for candidate in ingred_candidates:
-        ingred = db.RecipeIngredient(text=candidate.text,
-                                     recipe_id=recipe.id)
-        session.add(ingred)
+        ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
+        session.add(ingred)
         session.flush()
         parts = parse_ingredient(ingred.text)
         if parts:
-            quantity, unit, instruction,ingredient, supplement = parts
-            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
-                                                    quantity = quantity,
-                                                    unit = unit,
-                                                    instruction = instruction,
-                                                    ingredient = ingredient,
-                                                    supplement = supplement)
+            quantity, unit, instruction, ingredient, supplement = parts
+            ingred_parts = db.RecipeIngredientParts(
+                id=ingred.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=ingredient,
+                supplement=supplement,
+            )
             session.add(ingred_parts)
     logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
     return recipe

-def main(): # pragma: no cover
+
+def main():  # pragma: no cover
     parser = ArgumentParser(description="Scrape a recipe site for recipies")
-    parser.add_argument('site',
-                        help='Name of site')
-    parser.add_argument('-id', '--identifier', dest='id',
-                        help='url of recipe(reletive to base url of site) or commma seperated list')
-    parser.add_argument('-a', '--auto', action='store', dest='n',
-                        help='automaticaly generate identifier(must supply number of recipies to scrape)')
-    parser.add_argument('-v', '--verbose', action='store_true')
+    parser.add_argument("site", help="Name of site")
+    parser.add_argument(
+        "-id",
+        "--identifier",
+        dest="id",
+        help="url of recipe(reletive to base url of site) or commma seperated list",
+    )
+    parser.add_argument(
+        "-a",
+        "--auto",
+        action="store",
+        dest="n",
+        help="automaticaly generate identifier(must supply number of recipies to scrape)",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
     args = parser.parse_args(sys.argv)
     if args.verbose:
         logging.basicConfig(level=logging.INFO)
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
     eng = db.get_engine()
     S = sessionmaker(eng)
@@ -146,32 +172,34 @@ def main(): # pragma: no cover
     with S.begin() as sess:
         site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
         site_id = site.id
         recipe_ids = []
         starting_id = 0
         if args.id and not args.n:
             recipe_ids.append(args.id)
-            logging.info(f'Retreiving single recipe: {args.id}')
+            logging.info(f"Retreiving single recipe: {args.id}")
         elif args.n:
             if not args.id:
-                last_recipe = sess.query(db.Recipe).\
-                    where(db.Recipe.recipe_site_id == site.id).\
-                    order_by(desc(db.Recipe.identifier)).\
-                    limit(1).\
-                    scalar()
+                last_recipe = (
+                    sess.query(db.Recipe)
+                    .where(db.Recipe.recipe_site_id == site.id)
+                    .order_by(desc(db.Recipe.identifier))
+                    .limit(1)
+                    .scalar()
+                )
                 starting_id = int(last_recipe.identifier) + 1
             else:
                 starting_id = int(args.id)
-            recipe_ids = range(starting_id, starting_id+int(args.n))
-            logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
+            recipe_ids = range(starting_id, starting_id + int(args.n))
+            logging.info(
+                f"Retreving {args.n} recipes from {site.base_url} starting at {starting_id}"
+            )
         for recipe_id in recipe_ids:
             try:
                 savepoint = sess.begin_nested()
-                recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
+                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                 parse_recipe(sess, recipe, site)
                 savepoint.commit()
@@ -183,6 +211,6 @@ def main(): # pragma: no cover
                 logging.error(e)
                 continue

-if __name__ == "__main__": # pragma: no cover
+if __name__ == "__main__":  # pragma: no cover
     main()

View File

@@ -9,4 +9,17 @@ def test_load_page():
     assert type(page) == BeautifulSoup
     page = scrape.load_page("https://hs.andreistoica.ca:4943/some-nonesense")
     assert page == None
+
+
+def test_ingredient_regex():
+    regex = scrape.ingredient_regex(["cup"], ["crushed"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )
+    regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )