Automatic import of N recipes

Andrei Stoica 2022-07-20 18:00:49 -04:00
parent 54ffd79836
commit b5061caed5
1 changed file with 60 additions and 18 deletions


@@ -1,5 +1,5 @@
 import db
-from sqlalchemy import select
+from sqlalchemy import select, desc, cast, Integer
 from sqlalchemy.orm import sessionmaker
 import bs4
 from urllib.request import urlopen
@@ -9,8 +9,10 @@ from argparse import ArgumentParser
 parser = ArgumentParser(description="Scrape a recipe site for recipes")
 parser.add_argument('site',
                     help='Name of site')
-parser.add_argument('identifier',
-                    help='url of recipe(reletive to base url of site)')
+parser.add_argument('-id', '--identifier', dest='id',
+                    help='url of recipe (relative to base url of site) or comma-separated list')
+parser.add_argument('-a', '--auto', action='store', dest='n',
+                    help='automatically generate identifiers (must supply number of recipes to scrape)')
 parser.add_argument('-v', '--verbose', action='store_true')
 args = parser.parse_args()
@@ -22,21 +24,61 @@ eng = db.get_engine()
 S = sessionmaker(eng)
 with S.begin() as sess:
-    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == 'AllRecipe').one()
+    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
     site_id = site.id
-    recipe = db.Recipe(identifier = args.identifier, recipe_site_id = site.id)
-    with urlopen(site.base_url + recipe.identifier) as f:
-        recipe_page = bs4.BeautifulSoup(f.read().decode())
+    recipe_ids = []
+    starting_id = 0
+    if args.id and not args.n:
+        # split so the comma-separated form promised in the help text actually works
+        recipe_ids.extend(args.id.split(','))
+        logging.info(f'Retrieving recipe(s): {args.id}')
+    elif args.n:
+        if not args.id:
+            # cast the text identifier to Integer so '999' does not sort above '10000'
+            last_recipe = sess.query(db.Recipe).\
+                    where(db.Recipe.recipe_site_id == site.id).\
+                    order_by(desc(cast(db.Recipe.identifier, Integer))).\
+                    limit(1).\
+                    scalar()
+            # start at 1 when the site has no recipes yet (assumed default)
+            starting_id = int(last_recipe.identifier) + 1 if last_recipe else 1
+        else:
+            starting_id = int(args.id)
+        recipe_ids = range(starting_id, starting_id + int(args.n))
+        logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')
-    name_div = recipe_page.find_all(class_=site.name_class)[0]
-    recipe.name = name_div.text
-    sess.add(recipe)
-    sess.flush()
-    logging.info(f"Adding Recipe {recipe}")
-    ingredients = []
-    for ingredient in recipe_page.find_all(class_=site.ingredient_class):
-        ingredients.append(db.RecipeIngredient(text=ingredient.text,
-                                               recipe_id=recipe.id))
-    logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
-    sess.add_all(ingredients)
+    for recipe_id in recipe_ids:
+        recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
+        recipe_url = f'{site.base_url}/{recipe.identifier}'
+        logging.info(f'Loading Recipe: {recipe_url}')
+        try:
+            # a missing recipe (404) surfaces as an HTTPError from urlopen
+            # and is logged by the except below
+            with urlopen(recipe_url) as f:
+                recipe_page = bs4.BeautifulSoup(f.read().decode(), 'html.parser')
+            name_candidates = recipe_page.find_all(class_=site.name_class)
+            if len(name_candidates) == 0:
+                raise Exception(f"Could not extract recipe name: {recipe_url}")
+            name_div = name_candidates[0]
+        except Exception as e:
+            logging.warning(f"Could not download or parse recipe: {recipe_url}")
+            logging.warning(e)
+            continue
+        recipe.name = name_div.text
+        logging.info(f"Adding Recipe {recipe}")
+        sess.add(recipe)
+        sess.flush()
+        ingredients = []
+        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
+        for ingredient in ingred_candidates:
+            ingredients.append(db.RecipeIngredient(text=ingredient.text,
+                                                   recipe_id=recipe.id))
+        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
+        sess.add_all(ingredients)
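As a usage sketch, the new flags combine as below. The script filename (scrape.py) and the numeric identifiers are assumptions for illustration; the site name AllRecipe comes from the previously hard-coded query.

    # one recipe by identifier (relative to the site's base_url)
    python scrape.py AllRecipe -id 12345

    # several explicit recipes as a comma-separated list
    python scrape.py AllRecipe -id 12345,12346,12350

    # auto mode: the next 50 identifiers after the highest one already in the DB
    python scrape.py AllRecipe -a 50

    # auto mode from an explicit starting identifier: 20 recipes from 10000 upward
    python scrape.py AllRecipe -id 10000 -a 20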