diff --git a/src/scrape.py b/src/scrape.py
index f6eb24b..c85357b 100644
--- a/src/scrape.py
+++ b/src/scrape.py
@@ -1,5 +1,5 @@
 import db
-from sqlalchemy import select
+from sqlalchemy import select, desc
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
@@ -9,8 +9,10 @@
 from argparse import ArgumentParser
 parser = ArgumentParser(description="Scrape a recipe site for recipies")
 parser.add_argument('site', help='Name of site')
-parser.add_argument('identifier',
-                    help='url of recipe(reletive to base url of site)')
+parser.add_argument('-id', '--identifier', dest='id',
+                    help='URL of recipe (relative to base URL of site) or comma-separated list')
+parser.add_argument('-a', '--auto', action='store', dest='n',
+                    help='automatically generate identifiers (must supply number of recipes to scrape)')
 parser.add_argument('-v', '--verbose', action='store_true')
 
 args = parser.parse_args()
@@ -22,21 +24,63 @@
 eng = db.get_engine()
 S = sessionmaker(eng)
 
 with S.begin() as sess:
-    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == 'AllRecipe').one()
+    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
+    site_id = site.id
+
+    recipe_ids = []
+    starting_id = 0
+    if args.id and not args.n:
+        # Explicit identifiers: accept a single id or a comma-separated list
+        recipe_ids = args.id.split(',')
+        logging.info(f'Retrieving recipe(s): {args.id}')
+    elif args.n:
+        if not args.id:
+            # No starting id given: continue from the highest identifier already stored
+            last_recipe = sess.query(db.Recipe).\
+                where(db.Recipe.recipe_site_id == site.id).\
+                order_by(desc(db.Recipe.identifier)).\
+                limit(1).\
+                scalar()
+            starting_id = int(last_recipe.identifier) + 1 if last_recipe else 1
+        else:
+            starting_id = int(args.id)
+        recipe_ids = range(starting_id, starting_id + int(args.n))
+        logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')
 
-    recipe = db.Recipe(identifier = args.identifier, recipe_site_id = site.id)
-    with urlopen(site.base_url + recipe.identifier) as f:
-        recipe_page = bs4.BeautifulSoup(f.read().decode())
+
+
+    for recipe_id in recipe_ids:
+        recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
 
-    name_div = recipe_page.find_all(class_=site.name_class)[0]
-    recipe.name = name_div.text
-    sess.add(recipe)
-    sess.flush()
-    logging.info(f"Adding Recipe {recipe}")
+        recipe_url = f'{site.base_url}/{recipe.identifier}'
+        logging.info(f'Loading Recipe: {recipe_url}')
+        try:
+            with urlopen(recipe_url) as f:
+                if f.getcode() == 404:
+                    raise Exception(f"Recipe does not exist: {recipe_url}")
+                recipe_page = bs4.BeautifulSoup(f.read().decode(), 'html.parser')
+
+            # Skip this recipe if the page lacks the expected name element
+            name_candidates = recipe_page.find_all(class_=site.name_class)
+            if not name_candidates:
+                raise Exception(f"Could not extract recipe name: {recipe_url}")
+            name_div = name_candidates[0]
+        except Exception as e:
+            logging.warning(f"Could not download or parse recipe: {recipe_url}")
+            logging.warning(e)
+            continue
+
+        recipe.name = name_div.text
+        logging.info(f"Adding Recipe {recipe}")
 
-    ingredients = []
-    for ingredient in recipe_page.find_all(class_=site.ingredient_class):
-        ingredients.append(db.RecipeIngredient(text=ingredient.text,
-                                               recipe_id=recipe.id))
-    logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
-    sess.add_all(ingredients)
\ No newline at end of file
+        sess.add(recipe)
+        sess.flush()
+
+        ingredients = []
+        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
+        for ingredient in ingred_candidates:
+            ingredients.append(db.RecipeIngredient(text=ingredient.text,
+                                                   recipe_id=recipe.id))
+
+        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
+        sess.add_all(ingredients)
\ No newline at end of file