refactor code to split into functions + preliminary regex for ingredient extraction

Andrei Stoica 2022-07-22 12:26:58 -04:00
parent b5061caed5
commit c4c53d238a
2 changed files with 103 additions and 33 deletions

View File

@@ -1,3 +1,4 @@
+from typing import Text
 from sqlalchemy import create_engine, Column, Integer, String, \
     ForeignKey, UniqueConstraint
 from sqlalchemy.engine import URL
@@ -41,6 +42,16 @@ class RecipeIngredient(Base):
     recipe_id = Column(Integer, ForeignKey('Recipe.id'))
     ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))


+class RecipeIngredientParts(Base):
+    __tablename__ = 'RecipeIngredientParts'
+
+    id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
+    quantity = Column(String)
+    unit = Column(String)
+    ingredient = Column(String)
+    supplement = Column(String)
+
+
 def get_engine(use_dotenv=True, **kargs):
     if use_dotenv:
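
An aside on the new table's shape: RecipeIngredientParts borrows its primary key from RecipeIngredient, so each raw ingredient line gets at most one parsed row and no id sequence of its own. A minimal self-contained sketch of that one-to-one pattern (the in-memory engine and sample text here are stand-ins, not the project's configuration):

from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class RecipeIngredient(Base):
    __tablename__ = 'RecipeIngredient'
    id = Column(Integer, primary_key=True)
    text = Column(String)

class RecipeIngredientParts(Base):
    __tablename__ = 'RecipeIngredientParts'
    # shared primary key: at most one parsed row per raw ingredient line
    id = Column(Integer, ForeignKey('RecipeIngredient.id'), primary_key=True)
    quantity = Column(String)
    unit = Column(String)
    ingredient = Column(String)
    supplement = Column(String)

engine = create_engine('sqlite://')  # stand-in for the real database URL
Base.metadata.create_all(engine)

with sessionmaker(engine).begin() as session:
    raw = RecipeIngredient(text='2 cups all-purpose flour, sifted')
    session.add(raw)
    session.flush()  # populates raw.id
    session.add(RecipeIngredientParts(id=raw.id, quantity='2', unit='cups',
                                      ingredient='all-purpose flour',
                                      supplement='sifted'))

The shared key keeps the relationship one-to-one by construction, with no separate foreign-key column plus UniqueConstraint to maintain.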

View File

@@ -1,11 +1,88 @@
 import db
+import re
 from sqlalchemy import select, desc
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
+from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser
+
+
+def parse_ingredient(ingredient_text):
+    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch']
+    # quantity: digits or unicode vulgar fractions, optionally followed by a
+    # parenthesised equivalent such as "(250 g)"
+    number_regex = r'((?:[\d\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
+    ingredient_regex = r"([a-zA-Z '\-]+)"
+    supplement_regex = ',?(.*)'
+    # units match case-insensitively on their first letter, e.g. [cC]up
+    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
+                            for unit in units])
+    units_regex = f"((?:{units_regex})[s]?)"
+    regex = re.compile(number_regex +
+                       units_regex +
+                       ingredient_regex +
+                       supplement_regex)
+    m = regex.match(ingredient_text)
+    logging.info(f"Parsed {ingredient_text}, found: {m}")
+    if not m:
+        return None
+    return [text.strip() for text in m.groups()]
+
+
+def load_recipe(recipe_url):
+    try:
+        logging.info(f'Loading Recipe: {recipe_url}')
+        with urlopen(recipe_url) as f:
+            # urlopen raises HTTPError for 4xx responses, so the broad except
+            # below is what actually handles a missing recipe; this check is
+            # a defensive guard
+            if f.getcode() == 404:
+                raise Exception(f"Recipe does not exist: {recipe_url}")
+            return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
+    except Exception as e:
+        logging.warning(f"Could not download or parse recipe: {recipe_url}")
+        logging.warning(e)
+        return None
+
+
+def parse_recipe(session, recipe, site):
+    recipe_url = urljoin(site.base_url, str(recipe.identifier))
+    recipe_page = load_recipe(recipe_url)
+    if not recipe_page:
+        return None
+
+    name_candidates = recipe_page.find_all(class_=site.name_class)
+    if len(name_candidates) == 0:
+        raise Exception(f"Could not extract recipe name: {recipe_url}")
+    name_div = name_candidates[0]
+    recipe.name = name_div.text
+    logging.info(f"Adding Recipe {recipe}")
+    session.add(recipe)
+    session.flush()  # assigns recipe.id for the ingredient rows below
+
+    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
+    for candidate in ingred_candidates:
+        ingred = db.RecipeIngredient(text=candidate.text,
+                                     recipe_id=recipe.id)
+        session.add(ingred)
+        session.flush()  # assigns ingred.id for the parts row
+        parts = parse_ingredient(ingred.text)
+        if parts:
+            quantity, unit, ingredient, supplement = parts
+            ingred_parts = db.RecipeIngredientParts(id=ingred.id,
+                                                    quantity=quantity,
+                                                    unit=unit,
+                                                    ingredient=ingredient,
+                                                    supplement=supplement)
+            session.add(ingred_parts)
+    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
+    return recipe
+
+
 parser = ArgumentParser(description="Scrape a recipe site for recipes")
 parser.add_argument('site',
                     help='Name of site')
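
To make the "preliminary" in the commit message concrete, here is roughly how parse_ingredient above behaves; the sample strings are hypothetical, and a line whose unit is not in the hard-coded list fails to match entirely:

# assumes parse_ingredient from the file above is importable
print(parse_ingredient('2 cups all-purpose flour, sifted'))
# ['2', 'cups', 'all-purpose flour', 'sifted']
print(parse_ingredient('1 pinch salt'))
# ['1', 'pinch', 'salt', '']
print(parse_ingredient('2 eggs'))
# None -- no unit from the units list, so the regex rejects the line

The four captures line up with the quantity, unit, ingredient, and supplement columns of RecipeIngredientParts, which is exactly what parse_recipe unpacks.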
@@ -41,44 +118,26 @@ with S.begin() as sess:
             scalar()
         starting_id = int(last_recipe.identifier) + 1
     else:
-        starting_id = args.id
+        starting_id = int(args.id)

     recipe_ids = range(starting_id, starting_id + int(args.n))
     logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')
     for recipe_id in recipe_ids:
-        recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
-        recipe_url = f'{site.base_url}/{recipe.identifier}'
-        logging.info(f'Loading Recipe: {recipe_url}')
         try:
-            with urlopen(recipe_url) as f:
-                if f.getcode() == 404:
-                    raise Exception(f"Recipe Does not exist: {recipe_url}")
-                recipe_page = bs4.BeautifulSoup(f.read().decode())
+            savepoint = sess.begin_nested()
+            recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
+            parse_recipe(sess, recipe, site)
-            name_candidates = recipe_page.find_all(class_=site.name_class)
-            if len(name_candidates) == 0:
-                raise Exception(f"Could not extract recipe name: {recipe_url}")
-            name_div = name_candidates[0]
+            savepoint.commit()
+        except KeyboardInterrupt as e:
+            savepoint.rollback()
+            break
         except Exception as e:
-            logging.warning(f"Could not download or parse recipe: {recipe_url}")
-            logging.warning(e)
-            continue
+            savepoint.rollback()
+            logging.error(e)
+            break
-        recipe.name = name_div.text
-        logging.info(f"Adding Recipe {recipe}")
-        sess.add(recipe)
-        sess.flush()
-        ingredients = []
-        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
-        for ingredient in ingred_candidates:
-            ingredients.append(db.RecipeIngredient(text=ingredient.text,
-                                                   recipe_id=recipe.id))
-        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
-        sess.add_all(ingredients)
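
The loop now wraps each recipe in begin_nested(), i.e. a database SAVEPOINT: a failing recipe rolls back only its own rows while earlier recipes stay queued in the outer transaction. A self-contained sketch of the same pattern, with a contrived table and failure; the two event hooks are the SQLAlchemy-documented workaround pysqlite needs for SAVEPOINTs to behave:

from sqlalchemy import create_engine, event, Column, Integer, String
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Item(Base):
    __tablename__ = 'Item'
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine('sqlite://')  # illustrative in-memory database

@event.listens_for(engine, 'connect')
def do_connect(dbapi_connection, connection_record):
    dbapi_connection.isolation_level = None  # stop pysqlite's implicit BEGIN

@event.listens_for(engine, 'begin')
def do_begin(conn):
    conn.exec_driver_sql('BEGIN')  # emit our own BEGIN instead

Base.metadata.create_all(engine)

with sessionmaker(engine).begin() as sess:
    for name in ['first', 'broken', 'third']:
        savepoint = sess.begin_nested()  # SAVEPOINT per item
        try:
            sess.add(Item(name=name))
            if name == 'broken':
                raise ValueError('simulated parse failure')
            savepoint.commit()           # release this item's savepoint
        except ValueError:
            savepoint.rollback()         # undo only this item
# the outer commit keeps 'first' and 'third'; 'broken' was rolled back

One wrinkle in the committed version: if begin_nested() itself raised, the handlers would touch an unbound savepoint; binding it just before the try, as above, hardens them.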