refactor code to split into functions + preliminary regex for ingredient extraction

2022-07-22 12:26:58 -04:00
parent b5061caed5
commit c4c53d238a
2 changed files with 103 additions and 33 deletions
--- a/src/db.py
+++ b/src/db.py
@@ -1,3 +1,4 @@
 from typing import Text
 from sqlalchemy import create_engine, Column, Integer, String, \
                      ForeignKey, UniqueConstraint
 from sqlalchemy.engine import URL
@@ -41,6 +42,16 @@ class RecipeIngredient(Base):
    recipe_id = Column(Integer, ForeignKey('Recipe.id'))
    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
 class RecipeIngredientParts(Base):
    """Structured parts (quantity, unit, ingredient, supplement) for one
    RecipeIngredient row."""
    __tablename__ = 'RecipeIngredientParts'
    # Primary key that is also a foreign key to RecipeIngredient.id, so a
    # RecipeIngredient row can have at most one parts row.
    id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
    # Amount portion of the ingredient line, stored as text (not numeric).
    quantity = Column(String)
    # Unit of measure, stored as text.
    unit = Column(String)
    # Ingredient name, stored as text.
    ingredient = Column(String)
    # Additional free text for the ingredient -- presumably whatever follows
    # the ingredient name in the original line; confirm against the parser.
    supplement = Column(String)
 def get_engine(use_dotenv = True, **kargs):
    if use_dotenv:
--- a/src/scrape.py
+++ b/src/scrape.py
@@ -1,11 +1,88 @@
 import db
 import re
 from sqlalchemy import select, desc
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
 from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser
 def parse_ingredient(ingredient_text):
    """Split a free-text ingredient line into its component parts.

    The text is expected to start with an optional quantity followed by a
    known unit, e.g. ``"2 cups flour, sifted"``.

    Args:
        ingredient_text: Raw ingredient line to parse.

    Returns:
        ``[quantity, unit, ingredient, supplement]`` with surrounding
        whitespace stripped from each part, or ``None`` when the text does
        not match (for example when no known unit follows the quantity).
    """
    # 'ounce' was previously misspelled as 'once', so ounce amounts never matched.
    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch']
    # Raw strings keep regex escapes from being read as (invalid) Python
    # string escapes; the resulting patterns are unchanged.
    # Quantity: digits / vulgar-fraction characters (U+00BC-U+00BE,
    # U+2150-U+215E), optional whitespace and an optional "(...)" note.
    # NOTE(review): the nested quantifiers can backtrack heavily on long digit
    # runs that are not followed by a unit -- consider tightening.
    number_regex = r'((?:[\d\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
    # Ingredient name: letters, spaces, apostrophes and hyphens.
    ingredient_regex = r"([a-zA-Z '\-]+)"
    # Supplement: everything after an optional comma.
    supplement_regex = r',?(.*)'
    # Each unit may start with a lower- or upper-case letter and may be plural.
    units_regex = "|".join(f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
                           for unit in units)
    units_regex = f"((?:{units_regex})[s]?)"
    regex = re.compile(number_regex +
                       units_regex +
                       ingredient_regex +
                       supplement_regex)
    m = regex.match(ingredient_text)
    logging.info(f"Parsed {ingredient_text}, found: {m}")
    if not m:
        return None
    return [text.strip() for text in m.groups()]
 def load_recipe(recipe_url):
    """Fetch a recipe page and parse it into a BeautifulSoup document.

    Args:
        recipe_url: URL of the recipe page to download.

    Returns:
        The parsed page, or ``None`` if downloading or parsing raised any
        exception (the failure is logged as a warning instead of propagating).
    """
    try:
        logging.info(f'Loading Recipe: {recipe_url}')
        with urlopen(recipe_url) as response:
            status = response.getcode()
            if status == 404:
                raise Exception(f"Recipe Does not exist: {recipe_url}")
            markup = response.read().decode()
            page = bs4.BeautifulSoup(markup, 'html.parser')
        return page
    except Exception as err:
        # Best effort: report the problem and let the caller see None.
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(err)
        return None
 def parse_recipe(session, recipe, site):
    """Download a recipe page and store the recipe and its ingredients.

    Args:
        session: Database session used to add and flush the new rows.
        recipe: Recipe object whose ``identifier`` selects the page; its
            ``name`` is filled in from the page.
        site: Site record supplying ``base_url``, ``name_class`` and
            ``ingredient_class``.

    Returns:
        The populated ``recipe``, or ``None`` if the page could not be loaded.

    Raises:
        Exception: If no element with the site's name class is found.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    page = load_recipe(recipe_url)
    if not page:
        return None

    name_nodes = page.find_all(class_=site.name_class)
    if not name_nodes:
        raise Exception(f"Could not extract recipe name: {recipe_url}")
    recipe.name = name_nodes[0].text

    logging.info(f"Adding Recipe {recipe}")
    session.add(recipe)
    # Flush before recipe.id is read below (presumably so the database has
    # assigned it -- confirm against the Recipe model).
    session.flush()

    ingredient_nodes = page.find_all(class_=site.ingredient_class)
    for node in ingredient_nodes:
        row = db.RecipeIngredient(text=node.text, recipe_id=recipe.id)
        session.add(row)
        session.flush()

        parsed = parse_ingredient(row.text)
        if not parsed:
            # Unparseable lines keep only their raw text row.
            continue
        quantity, unit, ingredient, supplement = parsed
        session.add(db.RecipeIngredientParts(id=row.id,
                                             quantity=quantity,
                                             unit=unit,
                                             ingredient=ingredient,
                                             supplement=supplement))

    logging.info(f"{len(ingredient_nodes)} ingredients found. Inserting into DB")
    return recipe
 # Command-line interface for the scraper script.
 parser = ArgumentParser(description="Scrape a recipe site for recipes")
 # Positional argument naming the site to scrape.
 parser.add_argument('site',
                     help='Name of site')
@@ -41,44 +118,26 @@ with S.begin() as sess:
                            scalar()
            starting_id = int(last_recipe.identifier) + 1
        else:
-            starting_id = args.id
+            starting_id = int(args.id)
        recipe_ids = range(starting_id, starting_id+int(args.n))
        logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
    for recipe_id in recipe_ids:            
        recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
        recipe_url = f'{site.base_url}/{recipe.identifier}'
        logging.info(f'Loading Recipe: {recipe_url}')
        try: 
-            with urlopen(recipe_url) as f:
+            savepoint = sess.begin_nested()
                if f.getcode() == 404:
                    raise Exception(f"Recipe Does not exist: {recipe_url}")
                recipe_page = bs4.BeautifulSoup(f.read().decode())
            recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
            parse_recipe(sess, recipe, site)
-            name_candidates = recipe_page.find_all(class_=site.name_class)
+            savepoint.commit()
-            if len(name_candidates) == 0:
+        except KeyboardInterrupt as e:
-                raise Exception(f"Could not extract recipe name: {recipe_url}")
+            savepoint.rollback()
-            name_div = name_candidates[0]
+            break
        except Exception as e:
-            logging.warning(f"Could not download or parse recipe: {recipe_url}")
+            savepoint.rollback()
-            logging.warning(e)
+            logging.error(e)
-            continue
+            break
        recipe.name = name_div.text
        logging.info(f"Adding Recipe {recipe}")   
        sess.add(recipe)
        sess.flush()
        ingredients = []
        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
        for ingredient in ingred_candidates:
            ingredients.append(db.RecipeIngredient(text=ingredient.text,
                                                recipe_id=recipe.id))
        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
        sess.add_all(ingredients)