Automatic import of N recipes

Andrei Stoica 2022-07-20 18:00:49 -04:00
parent 54ffd79836
commit b5061caed5
1 changed file with 60 additions and 18 deletions


@@ -1,5 +1,5 @@
 import db
-from sqlalchemy import select
+from sqlalchemy import select, desc
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
@@ -9,8 +9,10 @@ from argparse import ArgumentParser
 parser = ArgumentParser(description="Scrape a recipe site for recipes")
 parser.add_argument('site',
                     help='Name of site')
-parser.add_argument('identifier',
-                    help='URL of recipe (relative to base URL of site)')
+parser.add_argument('-id', '--identifier', dest='id',
+                    help='URL of recipe (relative to base URL of site) or comma-separated list')
+parser.add_argument('-a', '--auto', action='store', dest='n',
+                    help='automatically generate identifiers (must supply number of recipes to scrape)')
 parser.add_argument('-v', '--verbose', action='store_true')
 args = parser.parse_args()
@@ -22,21 +24,61 @@ eng = db.get_engine()
 S = sessionmaker(eng)
 with S.begin() as sess:
-    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == 'AllRecipe').one()
-    recipe = db.Recipe(identifier = args.identifier, recipe_site_id = site.id)
-    with urlopen(site.base_url + recipe.identifier) as f:
-        recipe_page = bs4.BeautifulSoup(f.read().decode())
-    name_div = recipe_page.find_all(class_=site.name_class)[0]
-    recipe.name = name_div.text
-    sess.add(recipe)
-    sess.flush()
-    logging.info(f"Adding Recipe {recipe}")
-    ingredients = []
-    for ingredient in recipe_page.find_all(class_=site.ingredient_class):
-        ingredients.append(db.RecipeIngredient(text=ingredient.text,
-                                               recipe_id=recipe.id))
-    logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
-    sess.add_all(ingredients)
+    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
+    site_id = site.id
+
+    # Work out which identifiers to scrape: an explicit (comma-separated)
+    # list, or a range generated from the database contents.
+    recipe_ids = []
+    starting_id = 0
+    if args.id and not args.n:
+        recipe_ids = args.id.split(',')
+        logging.info(f'Retrieving recipe(s): {args.id}')
+    elif args.n:
+        if not args.id:
+            # Continue after the highest identifier already stored for this site
+            last_recipe = sess.query(db.Recipe).\
+                where(db.Recipe.recipe_site_id == site.id).\
+                order_by(desc(db.Recipe.identifier)).\
+                limit(1).\
+                scalar()
+            starting_id = int(last_recipe.identifier) + 1
+        else:
+            starting_id = int(args.id)
+        recipe_ids = range(starting_id, starting_id + int(args.n))
+        logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')
+
+    for recipe_id in recipe_ids:
+        recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
+        recipe_url = f'{site.base_url}/{recipe.identifier}'
+        logging.info(f'Loading Recipe: {recipe_url}')
+        try:
+            with urlopen(recipe_url) as f:
+                if f.getcode() == 404:
+                    raise Exception(f"Recipe does not exist: {recipe_url}")
+                recipe_page = bs4.BeautifulSoup(f.read().decode())
+            name_candidates = recipe_page.find_all(class_=site.name_class)
+            if len(name_candidates) == 0:
+                raise Exception(f"Could not extract recipe name: {recipe_url}")
+            name_div = name_candidates[0]
+        except Exception as e:
+            # Skip recipes that fail to download or parse instead of aborting the run
+            logging.warning(f"Could not download or parse recipe: {recipe_url}")
+            logging.warning(e)
+            continue
+        recipe.name = name_div.text
+        logging.info(f"Adding Recipe {recipe}")
+        sess.add(recipe)
+        sess.flush()
+        ingredients = []
+        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
+        for ingredient in ingred_candidates:
+            ingredients.append(db.RecipeIngredient(text=ingredient.text,
+                                                   recipe_id=recipe.id))
+        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
+        sess.add_all(ingredients)
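
Usage sketch for the new flags. The script's file name is not shown in this view, so scrape.py is a placeholder, and the identifiers below are made-up examples; 'AllRecipe' is the site name used by the pre-change code:

    # Scrape one or more explicit recipes by identifier
    python scrape.py AllRecipe -id 24000
    python scrape.py AllRecipe -id 24000,24001,24002

    # Scrape the next 10 recipes, continuing after the highest
    # identifier already stored for the site
    python scrape.py AllRecipe -a 10

    # Scrape 10 recipes starting from an explicit identifier
    python scrape.py AllRecipe -id 24000 -a 10

Auto mode assumes numeric identifiers, since it sorts on the identifier column and advances with int() and range().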