Automatic import of N recipes
This commit is contained in:
parent
54ffd79836
commit
b5061caed5
|
|
@ -1,5 +1,5 @@
|
||||||
import db
|
import db
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select, desc
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
import bs4
|
import bs4
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
@ -9,8 +9,10 @@ from argparse import ArgumentParser
|
||||||
parser = ArgumentParser(description="Scrape a recipe site for recipies")
|
parser = ArgumentParser(description="Scrape a recipe site for recipies")
|
||||||
parser.add_argument('site',
|
parser.add_argument('site',
|
||||||
help='Name of site')
|
help='Name of site')
|
||||||
parser.add_argument('identifier',
|
parser.add_argument('-id', '--identifier', dest='id',
|
||||||
help='url of recipe(reletive to base url of site)')
|
help='url of recipe(reletive to base url of site) or commma seperated list')
|
||||||
|
parser.add_argument('-a', '--auto', action='store', dest='n',
|
||||||
|
help='automaticaly generate identifier(must supply number of recipies to scrape)')
|
||||||
parser.add_argument('-v', '--verbose', action='store_true')
|
parser.add_argument('-v', '--verbose', action='store_true')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
@ -22,21 +24,61 @@ eng = db.get_engine()
|
||||||
S = sessionmaker(eng)
|
S = sessionmaker(eng)
|
||||||
|
|
||||||
with S.begin() as sess:
|
with S.begin() as sess:
|
||||||
site = sess.query(db.RecipeSite).where(db.RecipeSite.name == 'AllRecipe').one()
|
site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
|
||||||
|
site_id = site.id
|
||||||
|
|
||||||
|
# Decide which recipe identifiers to scrape:
#   * -id only      -> the given identifier, or every entry of a comma
#                      separated list (as the --identifier help text promises)
#   * -a N only     -> N consecutive ids following the last stored recipe
#   * -a N with -id -> N consecutive ids starting at the given identifier
recipe_ids = []
starting_id = 0
if args.id and not args.n:
    # Splitting on commas leaves a plain identifier as a one-element list.
    recipe_ids = [part.strip() for part in args.id.split(',') if part.strip()]
    if len(recipe_ids) == 1:
        logging.info(f'Retreiving single recipe: {args.id}')
    else:
        logging.info(f'Retreiving {len(recipe_ids)} recipes: {args.id}')
elif args.n:
    if not args.id:
        # NOTE(review): this orders by the raw identifier column; if that
        # column is textual the ordering is lexicographic ('99' > '100'),
        # so the "last" recipe may not be the numerically largest - confirm.
        last_recipe = (
            sess.query(db.Recipe)
            .where(db.Recipe.recipe_site_id == site.id)
            .order_by(desc(db.Recipe.identifier))
            .limit(1)
            .scalar()
        )
        if last_recipe is None:
            # scalar() yields None when the site has no recipes yet; keep the
            # default starting_id instead of failing on None.identifier.
            logging.warning(
                f'No stored recipes for {site.base_url}; starting at {starting_id}'
            )
        else:
            starting_id = int(last_recipe.identifier) + 1
    else:
        # argparse hands the identifier over as a string; range() below
        # needs integers, so convert before doing arithmetic on it.
        starting_id = int(args.id)
    recipe_ids = range(starting_id, starting_id + int(args.n))
    logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
|
||||||
|
|
||||||
recipe = db.Recipe(identifier = args.identifier, recipe_site_id = site.id)
|
|
||||||
with urlopen(site.base_url + recipe.identifier) as f:
|
|
||||||
recipe_page = bs4.BeautifulSoup(f.read().decode())
|
for recipe_id in recipe_ids:
|
||||||
|
recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
|
||||||
|
|
||||||
name_div = recipe_page.find_all(class_=site.name_class)[0]
|
recipe_url = f'{site.base_url}/{recipe.identifier}'
|
||||||
recipe.name = name_div.text
|
logging.info(f'Loading Recipe: {recipe_url}')
|
||||||
sess.add(recipe)
|
try:
|
||||||
sess.flush()
|
with urlopen(recipe_url) as f:
|
||||||
logging.info(f"Adding Recipe {recipe}")
|
if f.getcode() == 404:
|
||||||
|
raise Exception(f"Recipe Does not exist: {recipe_url}")
|
||||||
|
recipe_page = bs4.BeautifulSoup(f.read().decode())
|
||||||
|
|
||||||
|
|
||||||
|
name_candidates = recipe_page.find_all(class_=site.name_class)
|
||||||
|
if len(name_candidates) == 0:
|
||||||
|
raise Exception(f"Could not extract recipe name: {recipe_url}")
|
||||||
|
name_div = name_candidates[0]
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Could not download or parse recipe: {recipe_url}")
|
||||||
|
logging.warning(e)
|
||||||
|
continue
|
||||||
|
|
||||||
|
recipe.name = name_div.text
|
||||||
|
logging.info(f"Adding Recipe {recipe}")
|
||||||
|
|
||||||
ingredients = []
|
sess.add(recipe)
|
||||||
for ingredient in recipe_page.find_all(class_=site.ingredient_class):
|
sess.flush()
|
||||||
ingredients.append(db.RecipeIngredient(text=ingredient.text,
|
|
||||||
recipe_id=recipe.id))
|
ingredients = []
|
||||||
logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
|
ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
|
||||||
sess.add_all(ingredients)
|
for ingredient in ingred_candidates:
|
||||||
|
ingredients.append(db.RecipeIngredient(text=ingredient.text,
|
||||||
|
recipe_id=recipe.id))
|
||||||
|
|
||||||
|
logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
|
||||||
|
sess.add_all(ingredients)
|
||||||
Loading…
Reference in New Issue