import sys
from recipe_graph import db
import re
from sqlalchemy import select, desc, exists, not_, except_
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen

# NOTE(review): the remainder of the module (further imports such as
# ``logging`` / ``ArgumentParser``, and ``parse_recipe``) lies outside the
# hunks visible here and is unchanged.


def main(argv=None):
    """Scrape one or more recipes from a configured site into the database.

    Parses the command line, looks up the named ``RecipeSite`` and then calls
    ``parse_recipe`` once per recipe identifier, each inside its own nested
    transaction (savepoint) so that one failing recipe does not discard the
    others.

    Args:
        argv: Command-line arguments *without* the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, or when ``--auto`` is used without
            ``--identifier`` and the site has no stored recipe to continue
            from.
    """
    parser = ArgumentParser(description="Scrape a recipe site for recipies")
    parser.add_argument('site',
                        help='Name of site')
    parser.add_argument('-id', '--identifier', dest='id',
                        help='url of recipe(reletive to base url of site) or commma seperated list')
    parser.add_argument('-a', '--auto', action='store', dest='n',
                        help='automaticaly generate identifier(must supply number of recipies to scrape)')
    parser.add_argument('-v', '--verbose', action='store_true')

    # Passing sys.argv verbatim would make argparse treat the script path as
    # the positional ``site``; ``None`` lets argparse skip the program name.
    args = parser.parse_args(argv)
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    eng = db.get_engine()
    S = sessionmaker(eng)

    with S.begin() as sess:
        site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()

        recipe_ids = []
        if args.id and not args.n:
            # NOTE(review): the help text mentions a comma separated list, but
            # the value is used as one identifier here -- confirm intent.
            recipe_ids.append(args.id)
            logging.info(f'Retreiving single recipe: {args.id}')
        elif args.n:
            if not args.id:
                # Continue after the highest identifier already stored for
                # this site.
                # NOTE(review): if ``identifier`` is a text column this
                # ordering is lexicographic, not numeric -- verify.
                last_recipe = sess.query(db.Recipe).\
                    where(db.Recipe.recipe_site_id == site.id).\
                    order_by(desc(db.Recipe.identifier)).\
                    limit(1).\
                    scalar()
                if last_recipe is None:
                    # Nothing stored yet, so there is no identifier to
                    # continue from; fail with a usable message instead of an
                    # AttributeError on ``None``.
                    parser.error(
                        f'no stored recipes for site {args.site!r}; '
                        'supply a starting identifier with --identifier'
                    )
                starting_id = int(last_recipe.identifier) + 1
            else:
                starting_id = int(args.id)
            recipe_ids = range(starting_id, starting_id + int(args.n))
            logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
        else:
            logging.warning('Nothing to scrape: supply --identifier and/or --auto')

        for recipe_id in recipe_ids:
            # Open the savepoint before the try block so the handlers below
            # always refer to the savepoint of the current iteration.
            savepoint = sess.begin_nested()
            try:
                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                parse_recipe(sess, recipe, site)

                savepoint.commit()
            except KeyboardInterrupt:
                # Discard the half-scraped recipe and stop the loop.
                savepoint.rollback()
                break
            except Exception as e:
                # Best effort: drop this recipe, report it, move on.
                savepoint.rollback()
                logging.error(e)
                continue


if __name__ == "__main__":  # pragma: no cover
    main()