refactor code to split into functions + preliminary regex for ingredient extraction
parent b5061caed5
commit c4c53d238a
src/db.py (11 lines changed)

@@ -1,3 +1,4 @@
from typing import Text
from sqlalchemy import create_engine, Column, Integer, String, \
    ForeignKey, UniqueConstraint
from sqlalchemy.engine import URL
@@ -40,6 +41,16 @@ class RecipeIngredient(Base):
    text = Column(String, nullable = False)
    recipe_id = Column(Integer, ForeignKey('Recipe.id'))
    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))

class RecipeIngredientParts(Base):
    __tablename__ = 'RecipeIngredientParts'

    id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
    quantity = Column(String)
    unit = Column(String)
    ingredient = Column(String)
    supplement = Column(String)


def get_engine(use_dotenv = True, **kargs):
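The new RecipeIngredientParts table reuses the RecipeIngredient id as both primary key and foreign key, so each raw ingredient line gets at most one parsed breakdown. A minimal sketch of the intended usage (illustrative only; assumes an open session sess and an already-flushed recipe row):

    ingred = db.RecipeIngredient(text='2 cups flour, sifted', recipe_id=recipe.id)
    sess.add(ingred)
    sess.flush()  # assigns ingred.id
    sess.add(db.RecipeIngredientParts(id=ingred.id, quantity='2', unit='cups',
                                      ingredient='flour', supplement='sifted'))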
src/scrape.py (125 lines changed)

@@ -1,11 +1,88 @@
import db
import re
from sqlalchemy import select, desc
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen
from urllib.parse import urljoin
import logging
from argparse import ArgumentParser

def parse_ingredient(ingredient_text):
    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch']
    # digits or Unicode vulgar fractions (e.g. ½), each optionally followed by a parenthesised amount
    number_regex = r'((?:[\d\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
    ingredient_regex = r"([a-zA-Z '\-]+)"
    supplement_regex = ',?(.*)'
    # each unit may start upper- or lower-case and take an optional plural 's'
    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
                            for unit in units])
    units_regex = f"((?:{units_regex})[s]?)"

    regex = re.compile(number_regex +
                       units_regex +
                       ingredient_regex +
                       supplement_regex)

    m = regex.match(ingredient_text)
    logging.info(f"Parsed {ingredient_text}, found: {m}")
    if not m:
        return None

    return [text.strip() for text in m.groups()]

def load_recipe(recipe_url):
    try:
        logging.info(f'Loading Recipe: {recipe_url}')
        with urlopen(recipe_url) as f:
            if f.getcode() == 404:
                raise Exception(f"Recipe Does not exist: {recipe_url}")
            return bs4.BeautifulSoup(f.read().decode(), 'html.parser')

    except Exception as e:
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(e)

    return None

def parse_recipe(session, recipe, site):
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_recipe(recipe_url)
    if not recipe_page:
        return None

    name_candidates = recipe_page.find_all(class_=site.name_class)
    if len(name_candidates) == 0:
        raise Exception(f"Could not extract recipe name: {recipe_url}")
    name_div = name_candidates[0]
    recipe.name = name_div.text

    logging.info(f"Adding Recipe {recipe}")

    session.add(recipe)
    session.flush()

    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
    for candidate in ingred_candidates:
        ingred = db.RecipeIngredient(text=candidate.text,
                                     recipe_id=recipe.id)
        session.add(ingred)
        session.flush()

        parts = parse_ingredient(ingred.text)
        if parts:
            quantity, unit, ingredient, supplement = parts
            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
                                                    quantity = quantity,
                                                    unit = unit,
                                                    ingredient = ingredient,
                                                    supplement = supplement)
            session.add(ingred_parts)

    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")

    return recipe


parser = ArgumentParser(description="Scrape a recipe site for recipes")
parser.add_argument('site',
                    help='Name of site')
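As a quick illustration of what parse_ingredient above is meant to return (the input string is invented; each captured group is passed through str.strip(), and an unmatched line yields None):

    parts = parse_ingredient('2 cups flour, sifted')
    # expected: ['2', 'cups', 'flour', 'sifted']  i.e. quantity, unit, ingredient, supplement

The generated units_regex expands to something like ((?:[tT]easpoon|[tT]ablespoon|...)[s]?), so 'cup', 'Cup' and 'cups' all count as units.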
@@ -41,44 +118,26 @@ with S.begin() as sess:
scalar()
starting_id = int(last_recipe.identifier) + 1
else:
starting_id = args.id
starting_id = int(args.id)
recipe_ids = range(starting_id, starting_id+int(args.n))
logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')

for recipe_id in recipe_ids:
recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
try:
savepoint = sess.begin_nested()

recipe_url = f'{site.base_url}/{recipe.identifier}'
logging.info(f'Loading Recipe: {recipe_url}')
try:
with urlopen(recipe_url) as f:
if f.getcode() == 404:
raise Exception(f"Recipe Does not exist: {recipe_url}")
recipe_page = bs4.BeautifulSoup(f.read().decode())

name_candidates = recipe_page.find_all(class_=site.name_class)
if len(name_candidates) == 0:
raise Exception(f"Could not extract recipe name: {recipe_url}")
name_div = name_candidates[0]
recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
parse_recipe(sess, recipe, site)

savepoint.commit()
except KeyboardInterrupt as e:
savepoint.rollback()
break
except Exception as e:
logging.warning(f"Could not download or parse recipe: {recipe_url}")
logging.warning(e)
continue

recipe.name = name_div.text
logging.info(f"Adding Recipe {recipe}")
savepoint.rollback()
logging.error(e)
break

sess.add(recipe)
sess.flush()

ingredients = []
ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
for ingredient in ingred_candidates:
ingredients.append(db.RecipeIngredient(text=ingredient.text,
recipe_id=recipe.id))

logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
sess.add_all(ingredients)
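A loose sketch of the refactored per-recipe loop, assuming the savepoint-per-recipe pattern visible in the hunk above (names taken from the diff, but this is not verbatim from the commit):

    for recipe_id in recipe_ids:
        try:
            savepoint = sess.begin_nested()  # SAVEPOINT scoped to this recipe
            recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
            parse_recipe(sess, recipe, site)
            savepoint.commit()               # keep this recipe's rows
        except KeyboardInterrupt as e:
            savepoint.rollback()
            break
        except Exception as e:
            savepoint.rollback()             # discard only this recipe's rows
            logging.error(e)
            break

This keeps one bad page from invalidating the surrounding sess transaction.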