refactoring

This commit is contained in:
Andrei Stoica 2022-10-15 14:40:16 -04:00
parent 294231dd48
commit 9a15f6c031
1 changed files with 56 additions and 56 deletions

View File

@ -1,9 +1,7 @@
from ast import alias
from dis import Instruction
import db
import sys
from recipe_graph import db
import re
from sqlalchemy import select, desc, exists, not_, except_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen
@ -129,62 +127,64 @@ def parse_recipe(session, recipe, site):
return recipe
def main():
    """Command-line entry point: scrape recipes from one site into the database.

    Arguments are read from the process command line:

    * ``site`` -- name of the ``db.RecipeSite`` row to scrape.
    * ``-id/--identifier`` -- identifier of a recipe on that site.
    * ``-a/--auto N`` -- scrape ``N`` consecutive integer identifiers.
    * ``-v/--verbose`` -- enable INFO logging, including SQLAlchemy's engine log.

    Which identifiers are scraped:

    * ``-id`` only: that single identifier.
    * ``-a N`` only: ``N`` identifiers starting one past the identifier of the
      recipe returned first when the site's recipes are ordered by
      ``identifier`` descending.
    * ``-id`` and ``-a N``: ``N`` identifiers starting at ``int(-id)``.
    * neither: nothing is scraped.

    Every recipe is processed inside its own SAVEPOINT, so a failure on one
    recipe is logged and rolled back without discarding the others. Ctrl-C
    rolls back the recipe in progress and stops the loop; recipes already
    scraped are committed when the enclosing transaction block exits.

    Raises:
        SystemExit: from argparse on invalid arguments, or when ``-a`` is
            given without ``-id`` and the site has no stored recipe to
            continue from.
        ValueError: if ``-a`` or the starting identifier is not an integer.
        Whatever ``Query.one()`` raises when ``site`` does not match exactly
        one row.
    """
    parser = ArgumentParser(description="Scrape a recipe site for recipies")
    parser.add_argument('site',
                        help='Name of site')
    # NOTE(review): the help text mentions a comma separated list, but the
    # value is used below as one identifier and never split -- confirm intent.
    parser.add_argument('-id', '--identifier', dest='id',
                        help='url of recipe(reletive to base url of site) or commma seperated list')
    parser.add_argument('-a', '--auto', action='store', dest='n',
                        help='automaticaly generate identifier(must supply number of recipies to scrape)')
    parser.add_argument('-v', '--verbose', action='store_true')

    # parse_args() already defaults to sys.argv[1:]; passing sys.argv itself
    # would make the program name the positional ``site`` argument.
    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    eng = db.get_engine()
    S = sessionmaker(eng)

    # One outer transaction for the whole run; committed when the block exits
    # normally, rolled back if an exception escapes it.
    with S.begin() as sess:
        site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()

        recipe_ids = []
        if args.id and not args.n:
            recipe_ids.append(args.id)
            logging.info(f'Retreiving single recipe: {args.id}')
        elif args.n:
            if not args.id:
                last_recipe = sess.query(db.Recipe).\
                    where(db.Recipe.recipe_site_id == site.id).\
                    order_by(desc(db.Recipe.identifier)).\
                    limit(1).\
                    scalar()
                if last_recipe is None:
                    # Nothing stored for this site yet, so there is no
                    # identifier to continue from.
                    parser.error(
                        f'no stored recipes for site {args.site!r}; '
                        'supply a starting identifier with -id')
                starting_id = int(last_recipe.identifier) + 1
            else:
                starting_id = int(args.id)
            recipe_ids = range(starting_id, starting_id+int(args.n))
            logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')

        for recipe_id in recipe_ids:
            # Open the savepoint before the try block so the handlers below
            # always refer to the savepoint of the current recipe.
            savepoint = sess.begin_nested()
            try:
                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                parse_recipe(sess, recipe, site)
                savepoint.commit()
            except KeyboardInterrupt:
                # Discard the half-scraped recipe and stop; earlier recipes
                # stay in the outer transaction.
                savepoint.rollback()
                break
            except Exception as e:
                # Best effort: skip this recipe and carry on with the next.
                savepoint.rollback()
                logging.error(e)
if __name__ == "__main__":  # pragma: no cover
    # Run the scraper only when executed as a script, not when imported.
    main()