Added missing units to regex + function for reparsing missing ingredient text

This commit is contained in:
Andrei Stoica 2022-07-22 15:12:16 -04:00
parent c4c53d238a
commit 6f3056bc1c
1 changed files with 26 additions and 5 deletions

View File

@ -1,6 +1,7 @@
from ast import alias
import db import db
import re import re
from sqlalchemy import select, desc from sqlalchemy import select, desc, exists, not_, except_
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
import bs4 import bs4
@ -10,13 +11,15 @@ import logging
from argparse import ArgumentParser from argparse import ArgumentParser
def parse_ingredient(ingredient_text): def parse_ingredient(ingredient_text):
units = ['teaspoon', 'tablespoon', 'gram', 'once', 'jar', 'cup', 'pinch'] units = ['teaspoon', 'tablespoon', 'gram', 'once', 'jar', 'cup', 'pinch',
number_regex = '((?:[\d\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)' 'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
'bunch', 'quart', 'cube', 'envelope', 'squars', 'sprig']
number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
ingredient_regex = '([a-zA-Z \'\-]+)' ingredient_regex = '([a-zA-Z \'\-]+)'
supplement_regex = ',?(.*)' supplement_regex = ',?(.*)'
units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}' units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
for unit in units]) for unit in units])
units_regex = f"((?:{units_regex})[s]?)" units_regex = f"((?:(?:{units_regex})e?s?)?)"
regex = re.compile(number_regex + regex = re.compile(number_regex +
units_regex + units_regex +
@ -30,6 +33,24 @@ def parse_ingredient(ingredient_text):
return [text.strip() for text in m.groups()] return [text.strip() for text in m.groups()]
def reparse_ingredients(session):
    """Parse stored ingredients that have no parsed-parts row yet.

    Selects every ``db.RecipeIngredient`` whose ``id`` does not appear in
    ``db.RecipeIngredientParts``, runs ``parse_ingredient`` on its ``text``
    and adds a new ``db.RecipeIngredientParts`` row to ``session`` for each
    ingredient that parses successfully.

    Nothing is committed here; the pending rows stay in ``session`` until
    the caller commits.

    Args:
        session: SQLAlchemy ORM session used for the query and the inserts.
    """
    # IDs present in RecipeIngredient but absent from RecipeIngredientParts.
    # The compound SELECT is handed to in_() directly: wrapping it in
    # .alias() first turns it into a FROM-clause object that SQLAlchemy must
    # coerce back into a SELECT (with a warning) before using it in IN (...).
    missing_ids = except_(select(db.RecipeIngredient.id),
                          select(db.RecipeIngredientParts.id))
    missing = session.query(db.RecipeIngredient).\
        where(db.RecipeIngredient.id.in_(missing_ids)).all()

    for ingredient in missing:
        parts = parse_ingredient(ingredient.text)
        if not parts:
            # Still unparseable; leave it without a parts row, but make the
            # skip visible instead of dropping it silently.
            logging.warning(f'Could not parse ingredient {ingredient.id}: '
                            f'{ingredient.text!r}')
            continue

        quantity, unit, name, supplement = parts
        session.add(db.RecipeIngredientParts(id=ingredient.id,
                                             quantity=quantity,
                                             unit=unit,
                                             ingredient=name,
                                             supplement=supplement))
def load_recipe(recipe_url): def load_recipe(recipe_url):
try: try:
logging.info(f'Loading Recipe: {recipe_url}') logging.info(f'Loading Recipe: {recipe_url}')
@ -79,7 +100,7 @@ def parse_recipe(session, recipe, site):
session.add(ingred_parts) session.add(ingred_parts)
logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB") logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
return recipe return recipe