From cf2e1ed21f2cf037e02786212b854ab26bf6fdae Mon Sep 17 00:00:00 2001
From: Andrei Stoica
Date: Mon, 18 Jul 2022 11:13:53 -0400
Subject: [PATCH] inital commit

---
 .gitignore             |  4 +++
 README.md              | 34 ++++++++++++++++++++++
 docker-compose.yml     | 17 +++++++++++
 docker/psql/Dockerfile | 10 +++++++
 docker/psql/init.sql   |  1 +
 docker/psql/setup.sh   |  3 ++
 requirements.txt       |  7 +++++
 src/db.py              | 64 ++++++++++++++++++++++++++++++++++++++++++
 src/func.sql           | 13 +++++++++
 src/insert_sites.py    | 29 +++++++++++++++++++
 src/scrape.py          | 42 +++++++++++++++++++++++++++
 src/triggers.sql       | 20 +++++++++++++
 12 files changed, 244 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 docker-compose.yml
 create mode 100644 docker/psql/Dockerfile
 create mode 100644 docker/psql/init.sql
 create mode 100644 docker/psql/setup.sh
 create mode 100644 requirements.txt
 create mode 100644 src/db.py
 create mode 100644 src/func.sql
 create mode 100644 src/insert_sites.py
 create mode 100644 src/scrape.py
 create mode 100644 src/triggers.sql

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..deb14e9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+data/
+*__pycache__
+*env
+*.code-workspace
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9c10145
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+# Recipe Graph
+
+## Setup
+Prerequisites
+ - Docker compose
+ - Python
+
+Install python requirements
+```sh
+python -m pip install -r requirements.txt
+```
+
+Start database
+```sh
+docker-compose up
+```
+
+Initialize database and recipe sites
+```sh
+python src/db.py
+python src/insert_sites.py data/sites.json
+```
+
+## Usage
+Import new recipes
+```sh
+python src/scrape.py SITE IDENTIFIER
+```
+
+
+## TODO
+- automate scraping
+- matching ingredients to recipe ingredients
+- extend importing functionality to more websites
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..6bd1d5e
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,17 @@
+services:
+  psql:
+    build:
+      context: docker/psql
+      dockerfile: Dockerfile
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+
+    ports:
+      - "5432:5432"
+    volumes:
+      - "dbdata:/var/lib/postgresql/data"
+
+volumes:
+  dbdata:
diff --git a/docker/psql/Dockerfile b/docker/psql/Dockerfile
new file mode 100644
index 0000000..a747e9c
--- /dev/null
+++ b/docker/psql/Dockerfile
@@ -0,0 +1,10 @@
+FROM postgres:14
+
+RUN apt-get update
+RUN apt-get -y install python3 \
+    python3-pip \
+    postgresql-plpython3-14
+
+RUN python3 -m pip install sentence-transformers
+
+ADD init.sql /docker-entrypoint-initdb.d
diff --git a/docker/psql/init.sql b/docker/psql/init.sql
new file mode 100644
index 0000000..7d242d5
--- /dev/null
+++ b/docker/psql/init.sql
@@ -0,0 +1 @@
+CREATE EXTENSION plpython3u
\ No newline at end of file
diff --git a/docker/psql/setup.sh b/docker/psql/setup.sh
new file mode 100644
index 0000000..9eee243
--- /dev/null
+++ b/docker/psql/setup.sh
@@ -0,0 +1,3 @@
+pip install sentence-transformers
+
+apt-get update && apt-get install postgresql-plpython3
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b4b5e06
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.11.1
+greenlet==1.1.2
+psycopg2-binary==2.9.3
+PyMySQL==1.0.2
+python-dotenv==0.20.0
+soupsieve==2.3.2.post1
+SQLAlchemy==1.4.39
diff --git a/src/db.py b/src/db.py
new file mode 100644
index 0000000..68047a1
--- /dev/null
+++ b/src/db.py
@@ -0,0 +1,64 @@
+from sqlalchemy import create_engine, Column, Integer, String, \
+    ForeignKey, UniqueConstraint
+from sqlalchemy.engine import URL
+from sqlalchemy.ext.declarative import declarative_base
+import os
+from dotenv import load_dotenv
+import logging
+
+
+Base = declarative_base()
+
+class Ingredient(Base):
+    __tablename__ = 'Ingredient'
+
+    id = Column(Integer, primary_key = True)
+    name = Column(String, nullable = False)
+
+class RecipeSite(Base):
+    __tablename__ = 'RecipeSite'
+
+    id = Column(Integer, primary_key = True)
+    name = Column(String, nullable = False, unique = True)
+    ingredient_class = Column(String, nullable = False)
+    name_class = Column(String, nullable = False)
+    base_url = Column(String, nullable = False, unique = True)
+
+class Recipe(Base):
+    __tablename__ = 'Recipe'
+
+    id = Column(Integer, primary_key = True)
+    name = Column(String)
+    identifier = Column(String, nullable = False)
+    recipe_site_id = Column(Integer, ForeignKey('RecipeSite.id'))
+    UniqueConstraint(identifier, recipe_site_id)
+
+class RecipeIngredient(Base):
+    __tablename__ = 'RecipeIngredient'
+
+    id = Column(Integer, primary_key = True)
+    text = Column(String, nullable = False)
+    recipe_id = Column(Integer, ForeignKey('Recipe.id'))
+    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
+
+
+def get_engine(use_dotenv = True, **kargs):
+    if use_dotenv:
+        load_dotenv()  # copy settings from a local .env file into the environment
+    DB_URL = os.getenv("POSTGRES_URL")  # read unconditionally so use_dotenv=False also works
+    DB_USER = os.getenv("POSTGRES_USER")
+    DB_PASSWORD = os.getenv("POSTGRES_PASSWORD")
+    DB_NAME = os.getenv("POSTGRES_DB")
+
+    eng_url = URL.create('postgresql',
+                         username=DB_USER,
+                         password=DB_PASSWORD,
+                         host=DB_URL,
+                         database=DB_NAME)
+    return create_engine(eng_url)
+
+
+if __name__ == "__main__":
+    eng = get_engine()
+    logging.info(f"Createing DB Tables: {eng.url}")
+    Base.metadata.create_all(eng, checkfirst=True)
diff --git a/src/func.sql b/src/func.sql
new file mode 100644
index 0000000..1a1d7ac
--- /dev/null
+++ b/src/func.sql
@@ -0,0 +1,13 @@
+DROP FUNCTION IF EXISTS cos_sim;
+CREATE FUNCTION cos_sim(a TEXT, b TEXT)
+returns REAL
+AS $$
+    from sentence_transformers import CrossEncoder, util
+    model_name = "cross-encoder/stsb-roberta-large"
+
+    if not SD.get(model_name):
+        SD[model_name] = CrossEncoder(model_name)
+    model = SD[model_name]
+
+    return model.predict([(a, b)])[0]
+$$ LANGUAGE plpython3u;
\ No newline at end of file
diff --git a/src/insert_sites.py b/src/insert_sites.py
new file mode 100644
index 0000000..a3a7107
--- /dev/null
+++ b/src/insert_sites.py
@@ -0,0 +1,29 @@
+from sqlalchemy.orm import sessionmaker
+import db
+import json
+import argparse
+import logging
+
+parser = argparse.ArgumentParser(description='Import recipes into database')
+parser.add_argument('file', type=str,
+                    help='JSON file with recipe site information')
+parser.add_argument('-v', '--verbose', action='store_true')
+
+
+args = parser.parse_args()
+if args.verbose:
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+
+with open(args.file) as f:
+    sites = json.load(f)
+
+eng = db.get_engine()
+S = sessionmaker(eng)
+
+with S.begin() as session:
+    for site in sites:
+        logging.info(f"Adding {site}")
+        session.add(db.RecipeSite(**site))
+
+    
\ No newline at end of file
diff --git a/src/scrape.py b/src/scrape.py
new file mode 100644
index 0000000..f6eb24b
--- /dev/null
+++ b/src/scrape.py
@@ -0,0 +1,42 @@
+import db
+from sqlalchemy import select
+from sqlalchemy.orm import sessionmaker
+import bs4
+from urllib.request import urlopen
+import logging
+from argparse import ArgumentParser
+
+parser = ArgumentParser(description="Scrape a recipe site for recipies")
+parser.add_argument('site',
+                    help='Name of site')
+parser.add_argument('identifier',
+                    help='url of recipe(reletive to base url of site)')
+parser.add_argument('-v', '--verbose', action='store_true')
+
+args = parser.parse_args()
+if args.verbose:
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+
+eng = db.get_engine()
+S = sessionmaker(eng)
+
+with S.begin() as sess:  # one transaction: recipe and ingredients are committed together
+    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
+
+    recipe = db.Recipe(identifier = args.identifier, recipe_site_id = site.id)
+    with urlopen(site.base_url + recipe.identifier) as f:
+        recipe_page = bs4.BeautifulSoup(f.read().decode(), 'html.parser')  # explicit parser
+
+    name_div = recipe_page.find_all(class_=site.name_class)[0]
+    recipe.name = name_div.text
+    sess.add(recipe)
+    sess.flush()  # assigns recipe.id, needed by the ingredient rows below
+    logging.info(f"Adding Recipe {recipe}")
+
+    ingredients = []
+    for ingredient in recipe_page.find_all(class_=site.ingredient_class):
+        ingredients.append(db.RecipeIngredient(text=ingredient.text,
+                                               recipe_id=recipe.id))
+    logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
+    sess.add_all(ingredients)
\ No newline at end of file
diff --git a/src/triggers.sql b/src/triggers.sql
new file mode 100644
index 0000000..434f2f8
--- /dev/null
+++ b/src/triggers.sql
@@ -0,0 +1,20 @@
+CREATE OR REPLACE FUNCTION recipe_ingredient_update()
+RETURNS TRIGGER
+AS
+$$
+BEGIN
+    WITH I AS (
+        SELECT "Ingredient".id, cos_sim(NEW.text, "Ingredient".name) as sim
+        FROM "Ingredient"
+        WHERE regexp_split_to_array(NEW.text, E'\\s+') && regexp_split_to_array("Ingredient".name, E'\\s+')
+        ORDER BY sim DESC
+    )
+    SELECT I.id INTO NEW.ingredient_id from I ORDER BY I.sim DESC LIMIT 1; -- order here too: CTE order is not guaranteed to carry over
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE TRIGGER match_ingredient
+    BEFORE INSERT ON "RecipeIngredient"
+    FOR EACH ROW
+    EXECUTE FUNCTION recipe_ingredient_update();
\ No newline at end of file