Compare commits
2 Commits: 76e0438062 ... 259c08fd4e
| Author | SHA1 | Date |
|---|---|---|
| | 259c08fd4e | |
| | e6d150421f | |
README.md (16)
@@ -88,9 +88,13 @@ docker-compose -p recipe-test up
 running tests
 ```sh
-pytest
+pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
 ```

 The html report is under `htmlcov/` and can be viewed through any browser.
 The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
 plugin for VS Code to view coverage in your editor.

+**WARNING**: If you get `ERROR at setup of test_db_connection` and
+`ERROR at setup of test_db_class_creation`, please check whether the testing database is
+already initialized. Testing is destructive and should be done on a fresh database.
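The warning added above implies a reset workflow. As a minimal sketch (assuming the compose file provisions the test database and keeps its data in a named volume; the `-v` flag is an assumption, not shown in this diff), a fresh database can be recreated with the project name the README already uses:

```sh
# Sketch only: recreate the test database from scratch.
# -v also removes the data volume so tests start on a clean database (assumed setup).
docker-compose -p recipe-test down -v
docker-compose -p recipe-test up
```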
@@ -105,16 +109,6 @@ docker-compose -p recipe-test down
 Tests are written in the pytest framework. Currently focused on unit tests and code
 coverage. Integration tests to come.

-To run test use:
-```sh
-pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
-```
-
-The html report is under `htmlcov/` and can be viewed through any browser.
-The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
-plugin for VS Code to view coverage in your editor.
-
 ## TODO
 > ☑ automate scraping\
 > ☑ extracting quantity and name (via regex)\
@@ -9,41 +9,53 @@ from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser

-def parse_ingredient(ingredient_text):
-    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
-             'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
-             'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
-             'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
-             'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
-             'year', 'fillet', 'litter', 'packet', 'slices']
-    instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
-                    'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
-                    'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
-                    'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
-                    'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
-                    'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
-                    'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
-                    'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
-                    'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
-                    'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
-                    'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
-                    'unopened', 'unseasoned']
-    number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
-    ingredient_regex = '([a-zA-Z \'\-]+)'
-    supplement_regex = ',?(.*)'
-    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
-                            for unit in units])
+def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
+    number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
+    ingredient_regex = "([a-zA-Z '\-]+)"
+    supplement_regex = ",?(.*)"
+    units_regex = "|".join(
+        [f"[{unit[0]}{unit[0].capitalize()}]{unit[1:]}" for unit in units]
+    )
     units_regex = f"((?:(?:{units_regex})e?s?)?)"
-    instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}'
-                                   for inst in instructions])
+    instructions_regex = "|".join(
+        [f"[{inst[0]}{inst[0].capitalize()}]{inst[1:]}" for inst in instructions]
+    )
     instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"

-    regex = re.compile(number_regex +
-                       units_regex +
-                       instructions_regex +
-                       ingredient_regex +
-                       supplement_regex)
-
+    return re.compile(
+        number_regex
+        + units_regex
+        + instructions_regex
+        + ingredient_regex
+        + supplement_regex
+    )
+
+
+# TODO: load units and instructions from config.
+# Moved data into optional parameters for the time being.
+def parse_ingredient(
+    ingredient_text: str,
+    units: list[str] = ["teaspoon", "tablespoon", "gram", "ounce", "jar",
+                        "cup", "pinch", "container", "slice", "package",
+                        "pound", "can", "dash", "spear", "bunch", "quart",
+                        "cube", "envelope", "square", "sprig", "bag", "box",
+                        "drop", "fluid ounce", "gallon", "head", "link",
+                        "loaf", "pint", "pod", "sheet", "stalk", "whole",
+                        "bar", "bottle", "bulb", "year", "fillet", "litter",
+                        "packet", "slices"],
+    instructions: list[str] = [
+        "and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
+        "medium", "large", "couarse", "cracked", "crushed", "ground", "cooked",
+        "cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
+        "drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
+        "grated", "grilled", "hard", "hot", "juliened?", "leftover", "light",
+        "lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
+        "sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
+        "shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
+        "thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
+        "unopened", "unseasoned"],
+):
+    regex = ingredient_regex(units, instructions)
+
     m = regex.match(ingredient_text)
     logging.info(f"Parsed {ingredient_text}, found: {m}")
     if not m:
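Taken together, the refactor splits pattern construction out of parsing: `ingredient_regex` compiles the pattern from the unit and instruction vocabularies, and `parse_ingredient` carries them as keyword defaults until they move to config. A minimal usage sketch (the `scrape` module name comes from the test file below; the input string is illustrative, not taken from this diff):

```python
import scrape

# Build a pattern from custom vocabularies, as the new tests do...
regex = scrape.ingredient_regex(["cup"], ["crushed"])

# ...or rely on parse_ingredient's built-in defaults. On a match the
# five groups are (quantity, unit, instruction, ingredient, supplement).
parts = scrape.parse_ingredient("2 cups crushed tomatoes")
if parts:
    quantity, unit, instruction, name, supplement = parts
```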
@@ -52,34 +64,38 @@ def parse_ingredient(ingredient_text):
     return [text.strip() if text else None for text in m.groups()]


 def reparse_ingredients(session):
-    cte = (except_(select(db.RecipeIngredient.id),
-                   select(db.RecipeIngredientParts.id))).\
-        alias('missing')
-    missing = session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    cte = (
+        except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
+    ).alias("missing")
+    missing = (
+        session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
+    )

     for ingredient in missing:
         parts = parse_ingredient(ingredient.text)
         if not parts:
             continue
         quantity, unit, instruction, name, supplement = parts
-        session.add(db.RecipeIngredientParts(id = ingredient.id,
-                                             quantity = quantity,
-                                             unit = unit,
-                                             instruction = instruction,
-                                             ingredient = name,
-                                             supplement = supplement))
+        session.add(
+            db.RecipeIngredientParts(
+                id=ingredient.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=name,
+                supplement=supplement,
+            )
+        )


 def load_page(recipe_url):
-    try:
-        logging.info(f'Loading Page: {recipe_url}')
+    try:
+        logging.info(f"Loading Page: {recipe_url}")
         with req.get(recipe_url) as resp:
             if resp.status_code == 404:
                 raise Exception(f"Page does not exist (404): {recipe_url}")
-            return bs4.BeautifulSoup(resp.text, 'html.parser')
+            return bs4.BeautifulSoup(resp.text, "html.parser")

     except Exception as e:
         logging.warning(f"Could not download or parse recipe: {recipe_url}")
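`reparse_ingredients` backfills parsed parts for any ingredient rows that predate the regex: the `except_` CTE selects ingredient ids with no matching parts row. A sketch of driving it, reusing the engine and session setup that `main()` below already uses (the call site itself is not part of this diff):

```python
from sqlalchemy.orm import sessionmaker

# Sketch: backfill missing ingredient parts outside a scraper run.
S = sessionmaker(db.get_engine())
with S.begin() as sess:
    reparse_ingredients(sess)
```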
@@ -98,47 +114,57 @@ def parse_recipe(session, recipe, site):
     name_div = name_candidates[0]
     recipe.name = name_div.text

-    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
+    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")

     session.add(recipe)
     session.flush()

     ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
     for candidate in ingred_candidates:
-        ingred = db.RecipeIngredient(text=candidate.text,
-                                     recipe_id=recipe.id)
-        session.add(ingred)
+        ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
+        session.add(ingred)
         session.flush()

         parts = parse_ingredient(ingred.text)
         if parts:
-            quantity, unit, instruction,ingredient, supplement = parts
-            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
-                                                    quantity = quantity,
-                                                    unit = unit,
-                                                    instruction = instruction,
-                                                    ingredient = ingredient,
-                                                    supplement = supplement)
+            quantity, unit, instruction, ingredient, supplement = parts
+            ingred_parts = db.RecipeIngredientParts(
+                id=ingred.id,
+                quantity=quantity,
+                unit=unit,
+                instruction=instruction,
+                ingredient=ingredient,
+                supplement=supplement,
+            )
             session.add(ingred_parts)

     logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")

     return recipe

-def main(): # pragma: no cover
+
+def main():  # pragma: no cover
     parser = ArgumentParser(description="Scrape a recipe site for recipies")
-    parser.add_argument('site',
-                        help='Name of site')
-    parser.add_argument('-id', '--identifier', dest='id',
-                        help='url of recipe(reletive to base url of site) or commma seperated list')
-    parser.add_argument('-a', '--auto', action='store', dest='n',
-                        help='automaticaly generate identifier(must supply number of recipies to scrape)')
-    parser.add_argument('-v', '--verbose', action='store_true')
+    parser.add_argument("site", help="Name of site")
+    parser.add_argument(
+        "-id",
+        "--identifier",
+        dest="id",
+        help="url of recipe(reletive to base url of site) or commma seperated list",
+    )
+    parser.add_argument(
+        "-a",
+        "--auto",
+        action="store",
+        dest="n",
+        help="automaticaly generate identifier(must supply number of recipies to scrape)",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")

     args = parser.parse_args(sys.argv)
     if args.verbose:
         logging.basicConfig(level=logging.INFO)
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

     eng = db.get_engine()
     S = sessionmaker(eng)
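The rewritten parser keeps the original CLI surface. One editorial note: `parser.parse_args(sys.argv)` passes the program name through as the first argument, so the `site` positional can bind to the script path; argparse conventionally receives `sys.argv[1:]` (or no argument at all). Assuming that is fixed, an invocation might look like this (the script path and site name are illustrative, not from this diff):

```sh
# Illustrative only; module path and site name are assumptions.
python src/recipe_graph/scrape.py allrecipes -a 10 -v   # scrape ten new recipes
python src/recipe_graph/scrape.py allrecipes -id 24733  # scrape one identifier
```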
@@ -146,32 +172,34 @@ def main():  # pragma: no cover
     with S.begin() as sess:
         site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
         site_id = site.id

         recipe_ids = []
         starting_id = 0
         if args.id and not args.n:
             recipe_ids.append(args.id)
-            logging.info(f'Retreiving single recipe: {args.id}')
+            logging.info(f"Retreiving single recipe: {args.id}")
         elif args.n:
             if not args.id:
-                last_recipe = sess.query(db.Recipe).\
-                    where(db.Recipe.recipe_site_id == site.id).\
-                    order_by(desc(db.Recipe.identifier)).\
-                    limit(1).\
-                    scalar()
+                last_recipe = (
+                    sess.query(db.Recipe)
+                    .where(db.Recipe.recipe_site_id == site.id)
+                    .order_by(desc(db.Recipe.identifier))
+                    .limit(1)
+                    .scalar()
+                )
                 starting_id = int(last_recipe.identifier) + 1
             else:
                 starting_id = int(args.id)
-            recipe_ids = range(starting_id, starting_id+int(args.n))
-            logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
-
-
-        for recipe_id in recipe_ids:
-            try:
+            recipe_ids = range(starting_id, starting_id + int(args.n))
+            logging.info(
+                f"Retreving {args.n} recipes from {site.base_url} starting at {starting_id}"
+            )
+
+        for recipe_id in recipe_ids:
+            try:
                 savepoint = sess.begin_nested()

-                recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
+                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                 parse_recipe(sess, recipe, site)

                 savepoint.commit()
@@ -183,6 +211,6 @@ def main():  # pragma: no cover
                 logging.error(e)
                 continue

-
-if __name__ == "__main__": # pragma: no cover
-    main()
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
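The loop in `main()` wraps each recipe in a nested transaction so one bad page does not abort the whole run. In isolation the pattern looks like this sketch (the diff elides the `except` body, so the `savepoint.rollback()` call is an assumption; SQLAlchemy's `Session.begin_nested()` issues a SAVEPOINT):

```python
# Sketch of the per-recipe savepoint pattern used in main().
for recipe_id in recipe_ids:
    try:
        savepoint = sess.begin_nested()  # SAVEPOINT for this recipe only
        recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
        parse_recipe(sess, recipe, site)
        savepoint.commit()  # release the savepoint on success
    except Exception as e:
        savepoint.rollback()  # assumed: undo just this recipe's inserts
        logging.error(e)
        continue
```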
@@ -9,4 +9,17 @@ def test_load_page():
     assert type(page) == BeautifulSoup

     page = scrape.load_page("https://hs.andreistoica.ca:4943/some-nonesense")
-    assert page == None
+    assert page == None
+
+
+def test_ingredient_regex():
+    regex = scrape.ingredient_regex(["cup"], ["crushed"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )
+    regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
+    assert (
+        regex.pattern
+        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
+    )
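A small note on the assertions above: comparing with `== None` works, but `page is None` is the idiomatic (PEP 8) form. While iterating on the regex, pytest's keyword filter can run just the new test (invocation is a suggestion, not from this diff):

```sh
# -k selects tests by name substring; run only the new regex test.
pytest -k ingredient_regex
```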