initial commit

This commit is contained in:
Andrei Stoica 2022-07-18 11:13:53 -04:00
commit cf2e1ed21f
12 changed files with 244 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
data/
*__pycache__
*env
*.code-workspace

34
README.md Normal file
View File

@ -0,0 +1,34 @@
# Recipe Graph
## Setup
Prerequisites
- Docker compose
- Python
Install python requirements
```sh
python -m pip install -r requirements.txt
```
Start database
```sh
docker-compose up
```
Initialize database and recipe sites
```sh
python src/db.py
python src/insert_sites.py data/sites.json
```
## Usage
Import new recipes
```sh
python src/scrape.py <SiteName> <RecipeIdentifier>
```
## TODO
- automate scraping
- matching ingredients to recipe ingredients
- extend importing functionality to more websites

17
docker-compose.yml Normal file
View File

@ -0,0 +1,17 @@
# Single-service stack: a PostgreSQL container built from docker/psql.
services:
  psql:
    build:
      context: docker/psql
      dockerfile: Dockerfile
    # Credentials and database name are substituted from the POSTGRES_*
    # variables available to Compose when the stack is started.
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    # Publish the container's port 5432 on the same host port.
    ports:
      - "5432:5432"
    # Keep the database files in the named volume declared below.
    volumes:
      - "dbdata:/var/lib/postgresql/data"
volumes:
  dbdata:

10
docker/psql/Dockerfile Normal file
View File

@ -0,0 +1,10 @@
# PostgreSQL 14 image extended with PL/Python 3 and sentence-transformers.
FROM postgres:14
# Refresh the package index and install in the SAME layer: a separate
# "apt-get update" layer can be served from the build cache and leave the
# install step working against a stale index. The lists are removed afterwards
# because they are not needed at runtime.
RUN apt-get update \
    && apt-get -y install python3 \
        python3-pip \
        postgresql-plpython3-14 \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install sentence-transformers
# COPY is sufficient for a plain local file (no archive extraction or URL fetch
# is wanted here). The file lands in the image's init-script directory.
COPY init.sql /docker-entrypoint-initdb.d/

1
docker/psql/init.sql Normal file
View File

@ -0,0 +1 @@
-- Enable the untrusted PL/Python 3 language. IF NOT EXISTS keeps the script
-- safe to re-run, and the statement is explicitly terminated.
CREATE EXTENSION IF NOT EXISTS plpython3u;

3
docker/psql/setup.sh Normal file
View File

@ -0,0 +1,3 @@
# Installs the Python package and the PL/Python 3 PostgreSQL package.
# NOTE(review): docker/psql/Dockerfile installs the same pieces (there as
# postgresql-plpython3-14); confirm whether this script is still needed.
pip install sentence-transformers
# NOTE(review): "apt-get install" has no -y here, so it will ask for
# confirmation; verify this script is only ever run interactively.
apt-get update && apt-get install postgresql-plpython3

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
beautifulsoup4==4.11.1
greenlet==1.1.2
psycopg2-binary==2.9.3
PyMySQL==1.0.2
python-dotenv==0.20.0
soupsieve==2.3.2.post1
SQLAlchemy==1.4.39

64
src/db.py Normal file
View File

@ -0,0 +1,64 @@
from sqlalchemy import create_engine, Column, Integer, String, \
ForeignKey, UniqueConstraint
from sqlalchemy.engine import URL
from sqlalchemy.ext.declarative import declarative_base
import os
from dotenv import load_dotenv
import logging
# Declarative base shared by every model in this module; Base.metadata is what
# the __main__ section uses to create the tables.
Base = declarative_base()
class Ingredient(Base):
    """ORM model for the ``Ingredient`` table: an ingredient with a name."""
    __tablename__ = 'Ingredient'
    # Integer primary key.
    id = Column(Integer, primary_key = True)
    # Required ingredient name.
    name = Column(String, nullable = False)
class RecipeSite(Base):
    """ORM model for the ``RecipeSite`` table: a website recipes are read from."""
    __tablename__ = 'RecipeSite'
    # Integer primary key.
    id = Column(Integer, primary_key = True)
    # Required, unique site name.
    name = Column(String, nullable = False, unique = True)
    # Class name the scraper passes to find_all(class_=...) to collect the
    # ingredient elements of a recipe page.
    ingredient_class = Column(String, nullable = False)
    # Class name the scraper uses to locate the element holding the recipe name.
    name_class = Column(String, nullable = False)
    # Required, unique URL prefix; the scraper appends a recipe identifier to it.
    base_url = Column(String, nullable = False, unique = True)
class Recipe(Base):
    """ORM model for the ``Recipe`` table: one recipe belonging to a site."""
    __tablename__ = 'Recipe'
    # Integer primary key.
    id = Column(Integer, primary_key = True)
    # Recipe title; optional (nullable).
    name = Column(String)
    # Required identifier of the recipe on its site.
    identifier = Column(String, nullable = False)
    # Reference to the owning RecipeSite row.
    recipe_site_id = Column(Integer, ForeignKey('RecipeSite.id'))
    # Intended uniqueness of (identifier, recipe_site_id).
    # NOTE(review): this is a bare expression rather than an entry in
    # __table_args__; confirm the constraint appears in the generated DDL.
    UniqueConstraint(identifier, recipe_site_id)
class RecipeIngredient(Base):
    """ORM model for ``RecipeIngredient``: one ingredient line of a recipe."""
    __tablename__ = 'RecipeIngredient'
    # Integer primary key.
    id = Column(Integer, primary_key = True)
    # Required ingredient text as taken from the recipe page.
    text = Column(String, nullable = False)
    # Reference to the Recipe this line belongs to.
    recipe_id = Column(Integer, ForeignKey('Recipe.id'))
    # Reference to the matched Ingredient; nullable, so a line may be unmatched.
    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
def get_engine(use_dotenv=True, **kargs):
    """Build a SQLAlchemy engine for the PostgreSQL database.

    Connection settings come from the environment variables
    ``POSTGRES_URL`` (used as the host), ``POSTGRES_USER``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_DB``.

    Args:
        use_dotenv: When true, call ``load_dotenv()`` before reading the
            environment so the variables can come from a ``.env`` file.
        **kargs: Extra keyword arguments forwarded unchanged to
            ``create_engine`` (for example ``echo=True``).

    Returns:
        The engine produced by ``create_engine``.
    """
    if use_dotenv:
        load_dotenv()

    eng_url = URL.create(
        'postgresql',
        username=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_URL"),
        database=os.getenv("POSTGRES_DB"),
    )
    # The extra keyword arguments used to be accepted and then discarded;
    # pass them through so callers can actually configure the engine.
    return create_engine(eng_url, **kargs)
if __name__ == "__main__":
    # Script entry point: create every table declared on Base.
    # The root logger drops INFO records until logging is configured, so
    # without this call the message below is never shown.
    logging.basicConfig(level=logging.INFO)
    eng = get_engine()
    # Render the URL with the password masked: formatting the URL object
    # directly would write the plaintext database password to the log.
    safe_url = eng.url.render_as_string(hide_password=True)
    logging.info(f"Createing DB Tables: {safe_url}")
    # checkfirst=True skips tables that already exist.
    Base.metadata.create_all(eng, checkfirst=True)

13
src/func.sql Normal file
View File

@ -0,0 +1,13 @@
-- (Re)create cos_sim(a, b): a PL/Python function returning a REAL score for a
-- pair of texts.
DROP FUNCTION IF EXISTS cos_sim;
CREATE FUNCTION cos_sim(a TEXT, b TEXT)
returns REAL
AS $$
# The score comes from a sentence-transformers CrossEncoder's predict() on the
# pair (a, b).
from sentence_transformers import CrossEncoder, util
model_name = "cross-encoder/stsb-roberta-large"
# Keep the model in SD (PL/Python's per-function dictionary) under its name,
# so it is constructed only when no entry is stored yet.
if not SD.get(model_name):
    SD[model_name] = CrossEncoder(model_name)
model = SD[model_name]
# predict() receives a list with one pair; return that pair's score.
return model.predict([(a, b)])[0]
$$ LANGUAGE plpython3u;

29
src/insert_sites.py Normal file
View File

@ -0,0 +1,29 @@
from sqlalchemy.orm import sessionmaker
import db
import json
import argparse
import logging
# Command-line interface: a positional JSON path plus an optional -v flag.
parser = argparse.ArgumentParser(description='Import recipes into database')
parser.add_argument('file', type=str,
                    help='JSON file with recipe site information')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()

# In verbose mode also surface the SQL emitted by SQLAlchemy.
if args.verbose:
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

# Read the file as UTF-8 explicitly: open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII site data.
with open(args.file, encoding="utf-8") as f:
    sites = json.load(f)

eng = db.get_engine()
S = sessionmaker(eng)

# S.begin() commits on success and rolls back if any insert fails, so the
# sites are added all-or-nothing. Each JSON object supplies the keyword
# arguments for one RecipeSite row.
with S.begin() as session:
    for site in sites:
        logging.info(f"Adding {site}")
        session.add(db.RecipeSite(**site))

42
src/scrape.py Normal file
View File

@ -0,0 +1,42 @@
import db
from sqlalchemy import select
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen
import logging
from argparse import ArgumentParser
# Command-line interface: site name, recipe identifier, optional -v flag.
parser = ArgumentParser(description="Scrape a recipe site for recipies")
parser.add_argument('site',
                    help='Name of site')
parser.add_argument('identifier',
                    help='url of recipe(reletive to base url of site)')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()

# In verbose mode also surface the SQL emitted by SQLAlchemy.
if args.verbose:
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

eng = db.get_engine()
S = sessionmaker(eng)

# One transaction for the whole import: committed on success, rolled back on
# any error, so a recipe is never stored without its ingredients.
with S.begin() as sess:
    # Look up the site named on the command line. (This used to be hard-coded
    # to 'AllRecipe', which silently ignored the `site` argument.)
    site = (sess.query(db.RecipeSite)
            .where(db.RecipeSite.name == args.site)
            .one())
    recipe = db.Recipe(identifier=args.identifier, recipe_site_id=site.id)

    with urlopen(site.base_url + recipe.identifier) as f:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        recipe_page = bs4.BeautifulSoup(f.read().decode(), "html.parser")

    name_div = recipe_page.find(class_=site.name_class)
    if name_div is None:
        # LookupError is the base class of the IndexError raised previously,
        # so existing handlers still match; the message now names the cause.
        raise LookupError(
            f"No element with class {site.name_class!r} found on recipe page")
    recipe.name = name_div.text

    sess.add(recipe)
    # Flush so the database assigns recipe.id before it is referenced below.
    sess.flush()
    logging.info(f"Adding Recipe {recipe}")

    ingredients = [
        db.RecipeIngredient(text=ingredient.text, recipe_id=recipe.id)
        for ingredient in recipe_page.find_all(class_=site.ingredient_class)
    ]
    logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
    sess.add_all(ingredients)

20
src/triggers.sql Normal file
View File

@ -0,0 +1,20 @@
-- Trigger function: choose the best-matching Ingredient for the row being
-- written and store its id in NEW.ingredient_id.
CREATE OR REPLACE FUNCTION recipe_ingredient_update()
RETURNS TRIGGER
AS
$$
BEGIN
    -- Candidates are ingredients whose name shares at least one
    -- whitespace-separated word with NEW.text (&& is array overlap); each
    -- candidate is scored with cos_sim.
    WITH I AS (
        SELECT "Ingredient".id, cos_sim(NEW.text, "Ingredient".name) as sim
        FROM "Ingredient"
        WHERE regexp_split_to_array(NEW.text, E'\\s+') && regexp_split_to_array("Ingredient".name, E'\\s+')
    )
    -- Sort in the same query that applies LIMIT: an ORDER BY inside the CTE
    -- is not guaranteed to carry over to the outer SELECT.
    -- With no candidates, ingredient_id is set to NULL.
    SELECT I.id INTO NEW.ingredient_id FROM I ORDER BY I.sim DESC LIMIT 1;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Run recipe_ingredient_update() for every row before it is inserted into
-- "RecipeIngredient", so the function can modify NEW before it is stored.
CREATE OR REPLACE TRIGGER match_ingredient
BEFORE INSERT ON "RecipeIngredient"
FOR EACH ROW
EXECUTE FUNCTION recipe_ingredient_update();