Compare commits
40 Commits
04200b41ce
...
c4d5b3a7bf
| Author | SHA1 | Date |
|---|---|---|
|
|
c4d5b3a7bf | |
|
|
9d0413ada5 | |
|
|
a04bb06ed8 | |
|
|
cf05777f2c | |
|
|
209597432d | |
|
|
e207c359ed | |
|
|
35fadd6638 | |
|
|
259c08fd4e | |
|
|
e6d150421f | |
|
|
76e0438062 | |
|
|
794dbe7d88 | |
|
|
b6daacca2d | |
|
|
719b544007 | |
|
|
03ecae4be5 | |
|
|
5201a444e9 | |
|
|
d60634dff2 | |
|
|
a9970552d6 | |
|
|
51d631daf6 | |
|
|
b9e754c984 | |
|
|
a568fb244e | |
|
|
98f96543e6 | |
|
|
9a15f6c031 | |
|
|
294231dd48 | |
|
|
88b9707201 | |
|
|
c34af93533 | |
|
|
9b9e629548 | |
|
|
6189de8039 | |
|
|
3a45cfb02a | |
|
|
a5153e2406 | |
|
|
4c96bd8a28 | |
|
|
fe91134050 | |
|
|
c32459f0df | |
|
|
f290f49248 | |
|
|
754cb1235c | |
|
|
082c342256 | |
|
|
c6a75b59eb | |
|
|
a87c0f142e | |
|
|
d96476662b | |
|
|
c30fea1ddc | |
|
|
4f61fb5ccc |
|
|
@ -0,0 +1,66 @@
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: test
|
||||||
|
environment:
|
||||||
|
project_name: rgraph
|
||||||
|
trigger:
|
||||||
|
event:
|
||||||
|
include:
|
||||||
|
- push
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: db-up
|
||||||
|
image: docker/compose:alpine-1.29.2
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER:
|
||||||
|
from_secret: TESTING_USER
|
||||||
|
POSTGRES_PASSWORD:
|
||||||
|
from_secret: TESTING_PASSWORD
|
||||||
|
POSTGRES_DB:
|
||||||
|
from_secret: TESTING_DB
|
||||||
|
volumes:
|
||||||
|
- name: docker_sock
|
||||||
|
path: /var/run/docker.sock
|
||||||
|
commands:
|
||||||
|
- docker-compose -p rgraph-test up -d
|
||||||
|
|
||||||
|
- name: requirements
|
||||||
|
image: python:3.10-alpine
|
||||||
|
commands:
|
||||||
|
- python -m venv .venv
|
||||||
|
- . .venv/bin/activate
|
||||||
|
- pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: build
|
||||||
|
image: python:3.10-alpine
|
||||||
|
commands:
|
||||||
|
- . .venv/bin/activate
|
||||||
|
- pip install .
|
||||||
|
|
||||||
|
- name: test
|
||||||
|
image: python:3.10-alpine
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER:
|
||||||
|
from_secret: TESTING_USER
|
||||||
|
POSTGRES_PASSWORD:
|
||||||
|
from_secret: TESTING_PASSWORD
|
||||||
|
POSTGRES_DB:
|
||||||
|
from_secret: TESTING_DB
|
||||||
|
commands:
|
||||||
|
- hostip=$(ip route show | awk '/default/ {print $3}')
|
||||||
|
- export POSTGRES_URL=$hostip
|
||||||
|
- . .venv/bin/activate
|
||||||
|
- pytest
|
||||||
|
|
||||||
|
- name: db-cleanup
|
||||||
|
image: docker/compose:alpine-1.29.2
|
||||||
|
volumes:
|
||||||
|
- name: docker_sock
|
||||||
|
path: /var/run/docker.sock
|
||||||
|
commands:
|
||||||
|
- docker-compose -p rgraph-test down
|
||||||
|
- docker volume rm rgraph-test_dbdata
|
||||||
|
volumes:
|
||||||
|
- name: docker_sock
|
||||||
|
host:
|
||||||
|
path: /var/run/docker.sock
|
||||||
|
|
@ -2,4 +2,12 @@ data/
|
||||||
*__pycache__
|
*__pycache__
|
||||||
*env
|
*env
|
||||||
*.code-workspace
|
*.code-workspace
|
||||||
sandbox/
|
sandbox/
|
||||||
|
htmlcov
|
||||||
|
.coverage
|
||||||
|
*.lcov
|
||||||
|
.vscode/
|
||||||
|
*.pytest_cache/
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
|
|
||||||
51
README.md
51
README.md
|
|
@ -20,7 +20,19 @@ POSTGRES_DB=rgraph
|
||||||
|
|
||||||
Start database
|
Start database
|
||||||
```sh
|
```sh
|
||||||
docker-compose up
|
docker-compose -p recipe-dev up
|
||||||
|
```
|
||||||
|
|
||||||
|
Example `sites.json`
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "Example Site Name",
|
||||||
|
"ingredient_class": "example-ingredients-item-name",
|
||||||
|
"name_class" : "example-heading-content",
|
||||||
|
"base_url" : "https://www.example.com/recipe/"
|
||||||
|
}
|
||||||
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
Initialize database and recipe sites
|
Initialize database and recipe sites
|
||||||
|
|
@ -29,6 +41,11 @@ python src/db.py
|
||||||
python src/insert_sites.py data/sites.json
|
python src/insert_sites.py data/sites.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Shutdown database
|
||||||
|
```sh
|
||||||
|
docker-compose -p recipe-dev down
|
||||||
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
### Scrape
|
### Scrape
|
||||||
import new recipes
|
import new recipes
|
||||||
|
|
@ -60,6 +77,38 @@ options:
|
||||||
-v, --verbose
|
-v, --verbose
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
For testing create a new set up docker containers. Tests will fail if
|
||||||
|
the database is already initiated.
|
||||||
|
|
||||||
|
Starting testing db
|
||||||
|
```sh
|
||||||
|
docker-compose -p recipe-test up
|
||||||
|
```
|
||||||
|
|
||||||
|
running tests
|
||||||
|
```sh
|
||||||
|
pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
|
||||||
|
```
|
||||||
|
|
||||||
|
The html report is under `htmlcov/` and can be viewed through any browser.
|
||||||
|
The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
|
||||||
|
plugin for VS Code to view coverage in your editor.
|
||||||
|
|
||||||
|
**WARNINING**: If you get `ERROR at setup of test_db_connection` and
|
||||||
|
`ERROR at setup of test_db_class_creation`, please check if testing database is
|
||||||
|
already initiated. Testing is destructive and should be done on a fresh database.
|
||||||
|
|
||||||
|
|
||||||
|
Shutting down testing db
|
||||||
|
```sh
|
||||||
|
docker-compose -p recipe-test down
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Test are written in pytest framework. Currently focused on unittest and code
|
||||||
|
coverage. Integration tests to come.
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
> ☑ automate scraping\
|
> ☑ automate scraping\
|
||||||
> ☑ extracting quantity and name (via regex)\
|
> ☑ extracting quantity and name (via regex)\
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=61.0"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "recipe_graph"
|
||||||
|
version = "0.0.1"
|
||||||
|
description = "mapping out recipes relations"
|
||||||
|
dependencies = [
|
||||||
|
"SQLAlchemy==1.4.39",
|
||||||
|
"python-dotenv==0.20.0",
|
||||||
|
"beautifulsoup4==4.11.1",
|
||||||
|
"psycopg2-binary==2.9.3",
|
||||||
|
"requests~=2.30.0"
|
||||||
|
]
|
||||||
|
|
@ -1,7 +1,18 @@
|
||||||
|
attrs==22.1.0
|
||||||
beautifulsoup4==4.11.1
|
beautifulsoup4==4.11.1
|
||||||
|
coverage==6.5.0
|
||||||
greenlet==1.1.2
|
greenlet==1.1.2
|
||||||
|
iniconfig==1.1.1
|
||||||
|
packaging==21.3
|
||||||
|
pluggy==1.0.0
|
||||||
psycopg2-binary==2.9.3
|
psycopg2-binary==2.9.3
|
||||||
|
py==1.11.0
|
||||||
PyMySQL==1.0.2
|
PyMySQL==1.0.2
|
||||||
|
pyparsing==3.0.9
|
||||||
|
pytest==7.1.3
|
||||||
|
pytest-cov==4.0.0
|
||||||
python-dotenv==0.20.0
|
python-dotenv==0.20.0
|
||||||
|
requests~=2.30.0
|
||||||
soupsieve==2.3.2.post1
|
soupsieve==2.3.2.post1
|
||||||
SQLAlchemy==1.4.39
|
SQLAlchemy==1.4.39
|
||||||
|
tomli==2.0.1
|
||||||
|
|
|
||||||
219
src/db.py
219
src/db.py
|
|
@ -1,219 +0,0 @@
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
from types import NoneType
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from xmlrpc.client import Boolean
|
|
||||||
from sqlalchemy import create_engine, Column, Integer, String, Boolean, \
|
|
||||||
ForeignKey, UniqueConstraint, func, select, and_, or_, \
|
|
||||||
not_
|
|
||||||
from sqlalchemy.types import ARRAY
|
|
||||||
from sqlalchemy.engine import URL
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.orm import Session
|
|
||||||
|
|
||||||
|
|
||||||
Base = declarative_base()
|
|
||||||
|
|
||||||
class Ingredient(Base):
|
|
||||||
__tablename__ = 'Ingredient'
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key = True)
|
|
||||||
name = Column(String, nullable = False)
|
|
||||||
|
|
||||||
class RecipeSite(Base):
|
|
||||||
__tablename__ = 'RecipeSite'
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key = True)
|
|
||||||
name = Column(String, nullable = False, unique = True)
|
|
||||||
ingredient_class = Column(String, nullable = False)
|
|
||||||
name_class = Column(String, nullable = False)
|
|
||||||
base_url = Column(String, nullable = False, unique = True)
|
|
||||||
|
|
||||||
class Recipe(Base):
|
|
||||||
__tablename__ = 'Recipe'
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key = True)
|
|
||||||
name = Column(String)
|
|
||||||
identifier = Column(String, nullable = False)
|
|
||||||
recipe_site_id = Column(Integer, ForeignKey('RecipeSite.id'))
|
|
||||||
UniqueConstraint(identifier, recipe_site_id)
|
|
||||||
|
|
||||||
class RecipeIngredient(Base):
|
|
||||||
__tablename__ = 'RecipeIngredient'
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key = True)
|
|
||||||
text = Column(String, nullable = False)
|
|
||||||
recipe_id = Column(Integer, ForeignKey('Recipe.id'))
|
|
||||||
ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
|
|
||||||
|
|
||||||
class RecipeIngredientParts(Base):
|
|
||||||
__tablename__ = 'RecipeIngredientParts'
|
|
||||||
|
|
||||||
id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
|
|
||||||
quantity = Column(String)
|
|
||||||
unit = Column(String)
|
|
||||||
instruction = Column(String)
|
|
||||||
ingredient = Column(String)
|
|
||||||
supplement = Column(String)
|
|
||||||
|
|
||||||
class IngredientConnection(Base):
|
|
||||||
__tablename__ = 'IngredientConnection'
|
|
||||||
|
|
||||||
ingredient_a = Column(String,
|
|
||||||
ForeignKey("RecipeIngredientParts.ingredient"),
|
|
||||||
primary_key = True)
|
|
||||||
ingredient_b = Column(String,
|
|
||||||
ForeignKey("RecipeIngredientParts.ingredient"),
|
|
||||||
primary_key = True)
|
|
||||||
recipe_count = Column(Integer)
|
|
||||||
UniqueConstraint(ingredient_a, ingredient_b)
|
|
||||||
|
|
||||||
class RecipeConnection(Base):
|
|
||||||
__tablename__ = 'RecipeConnection'
|
|
||||||
|
|
||||||
recipe_a = Column(Integer,
|
|
||||||
ForeignKey("Recipe.id"),
|
|
||||||
primary_key = True)
|
|
||||||
recipe_b = Column(Integer,
|
|
||||||
ForeignKey("Recipe.id"),
|
|
||||||
primary_key = True)
|
|
||||||
ingredient_count = Column(Integer)
|
|
||||||
|
|
||||||
class RecipeGraphed(Base):
|
|
||||||
__tablename__ = "RecipeGraphed"
|
|
||||||
|
|
||||||
recipe_id = Column(Integer, ForeignKey("Recipe.id"), primary_key = True)
|
|
||||||
status = Column(Boolean, nullable = False, default = False)
|
|
||||||
|
|
||||||
|
|
||||||
def get_engine(use_dotenv = True, **kargs):
|
|
||||||
if use_dotenv:
|
|
||||||
load_dotenv()
|
|
||||||
DB_URL = os.getenv("POSTGRES_URL")
|
|
||||||
DB_USER = os.getenv("POSTGRES_USER")
|
|
||||||
DB_PASSWORD = os.getenv("POSTGRES_PASSWORD")
|
|
||||||
DB_NAME = os.getenv("POSTGRES_DB")
|
|
||||||
|
|
||||||
eng_url = URL.create('postgresql',
|
|
||||||
username=DB_USER,
|
|
||||||
password=DB_PASSWORD,
|
|
||||||
host=DB_URL,
|
|
||||||
database=DB_NAME)
|
|
||||||
return create_engine(eng_url)
|
|
||||||
|
|
||||||
|
|
||||||
def create_tables(eng):
|
|
||||||
logging.info(f"Createing DB Tables: {eng.url}")
|
|
||||||
Base.metadata.create_all(eng, checkfirst=True)
|
|
||||||
|
|
||||||
def pair_query(pairable, groupable, recipe_ids = None, pair_type = String):
|
|
||||||
pair_func= func.text_pairs
|
|
||||||
if pair_type == Integer:
|
|
||||||
pair_func=func.int_pairs
|
|
||||||
|
|
||||||
new_pairs = select(groupable,
|
|
||||||
pair_func(func.array_agg(pairable.distinct()),
|
|
||||||
type_=ARRAY(pair_type)).label("pair"))\
|
|
||||||
.join(RecipeIngredientParts)
|
|
||||||
|
|
||||||
if not type(recipe_ids) == NoneType:
|
|
||||||
new_pairs = new_pairs.where(RecipeIngredient.recipe_id.in_(recipe_ids))
|
|
||||||
|
|
||||||
new_pairs = new_pairs.group_by(groupable)\
|
|
||||||
.cte()
|
|
||||||
|
|
||||||
return new_pairs
|
|
||||||
|
|
||||||
def pair_count_query(pairs, countable, recipe_ids = None):
|
|
||||||
new_counts = select(pairs, func.count(func.distinct(countable)))
|
|
||||||
|
|
||||||
if not type(recipe_ids) == NoneType:
|
|
||||||
new_counts = new_counts.where(or_(pairs[0].in_(recipe_ids),
|
|
||||||
pairs[1].in_(recipe_ids)))
|
|
||||||
|
|
||||||
|
|
||||||
new_counts = new_counts.group_by(pairs)
|
|
||||||
|
|
||||||
return new_counts
|
|
||||||
|
|
||||||
def update_graph_connectivity(session = None):
|
|
||||||
# this is pure SQLAlchemy so it is more portable
|
|
||||||
# This would have been simpler if I utilized Postgres specific feature
|
|
||||||
if not session:
|
|
||||||
session = Session(get_engine())
|
|
||||||
|
|
||||||
with session.begin():
|
|
||||||
ids = select(Recipe.id)\
|
|
||||||
.join(RecipeGraphed, isouter = True)\
|
|
||||||
.where(RecipeGraphed.status.is_not(True))
|
|
||||||
|
|
||||||
num_recipes = session.execute(select(func.count('*')).select_from(ids.cte())).fetchone()[0]
|
|
||||||
if num_recipes <= 0:
|
|
||||||
logging.info("no new recipies")
|
|
||||||
return
|
|
||||||
|
|
||||||
logging.info(f"adding {num_recipes} recipes to the graphs")
|
|
||||||
|
|
||||||
new_pairs = pair_query(RecipeIngredientParts.ingredient,
|
|
||||||
RecipeIngredient.recipe_id,
|
|
||||||
recipe_ids = ids)
|
|
||||||
|
|
||||||
|
|
||||||
new_counts = pair_count_query(new_pairs.c.pair,
|
|
||||||
new_pairs.c.recipe_id)
|
|
||||||
|
|
||||||
logging.info("addeing new ingredient connections")
|
|
||||||
for pair, count in session.execute(new_counts):
|
|
||||||
connection = session.query(IngredientConnection)\
|
|
||||||
.where(and_(IngredientConnection.ingredient_a == pair[0],
|
|
||||||
IngredientConnection.ingredient_b == pair[1]))\
|
|
||||||
.first()
|
|
||||||
if connection:
|
|
||||||
connection.recipe_count += count
|
|
||||||
session.merge(connection)
|
|
||||||
else:
|
|
||||||
session.add(IngredientConnection(ingredient_a = pair[0],
|
|
||||||
ingredient_b = pair[1],
|
|
||||||
recipe_count = count))
|
|
||||||
|
|
||||||
# update RecipeConnection
|
|
||||||
logging.info("adding new recipe connections")
|
|
||||||
all_pairs = pair_query(RecipeIngredient.recipe_id,
|
|
||||||
RecipeIngredientParts.ingredient,
|
|
||||||
pair_type=Integer)
|
|
||||||
|
|
||||||
new_counts = pair_count_query(all_pairs.c.pair,
|
|
||||||
all_pairs.c.ingredient,
|
|
||||||
recipe_ids=ids)
|
|
||||||
|
|
||||||
i = 0
|
|
||||||
for pair, count in session.execute(new_counts):
|
|
||||||
session.add(RecipeConnection(recipe_a = pair[0],
|
|
||||||
recipe_b = pair[1],
|
|
||||||
ingredient_count = count))
|
|
||||||
# flush often to reduce memory usage
|
|
||||||
i += 1
|
|
||||||
if (i % 100000) == 0:
|
|
||||||
session.flush()
|
|
||||||
|
|
||||||
# update RecipeGraphed.status
|
|
||||||
logging.info("updating existing RecipeGraphed rows")
|
|
||||||
for recipeGraphed in session.query(RecipeGraphed)\
|
|
||||||
.where(RecipeGraphed.recipe_id.in_(ids)):
|
|
||||||
recipeGraphed.status = True
|
|
||||||
session.merge(recipeGraphed)
|
|
||||||
|
|
||||||
graphed = select(RecipeGraphed.recipe_id)
|
|
||||||
|
|
||||||
# add recipies that aren't in the table
|
|
||||||
logging.info("adding new RecipeGraphed rows")
|
|
||||||
for recipe in session.query(Recipe)\
|
|
||||||
.where(and_(Recipe.id.in_(ids),
|
|
||||||
not_(Recipe.id.in_(graphed)))):
|
|
||||||
session.add(RecipeGraphed(recipe_id=recipe.id, status=True))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
eng = get_engine()
|
|
||||||
create_tables(eng)
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
import db
|
|
||||||
import json
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Import recipes into database')
|
|
||||||
parser.add_argument('file', type=str,
|
|
||||||
help='JSON file with recipe site information')
|
|
||||||
parser.add_argument('-v', '--verbose', action='store_true')
|
|
||||||
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
if args.verbose:
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
|
|
||||||
|
|
||||||
with open(args.file) as f:
|
|
||||||
sites = json.load(f)
|
|
||||||
|
|
||||||
eng = db.get_engine()
|
|
||||||
S = sessionmaker(eng)
|
|
||||||
|
|
||||||
with S.begin() as session:
|
|
||||||
for site in sites:
|
|
||||||
logging.info(f"Adding {site}")
|
|
||||||
session.add(db.RecipeSite(**site))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,259 @@
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from types import NoneType
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from xmlrpc.client import Boolean
|
||||||
|
from sqlalchemy import (
|
||||||
|
create_engine,
|
||||||
|
Column,
|
||||||
|
Integer,
|
||||||
|
String,
|
||||||
|
Boolean,
|
||||||
|
ForeignKey,
|
||||||
|
UniqueConstraint,
|
||||||
|
func,
|
||||||
|
select,
|
||||||
|
and_,
|
||||||
|
or_,
|
||||||
|
not_,
|
||||||
|
)
|
||||||
|
from sqlalchemy.types import ARRAY
|
||||||
|
from sqlalchemy.engine import URL
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
from sqlalchemy.orm import Session, sessionmaker
|
||||||
|
|
||||||
|
|
||||||
|
Base = declarative_base()
|
||||||
|
|
||||||
|
|
||||||
|
class Ingredient(Base):
|
||||||
|
__tablename__ = "Ingredient"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True)
|
||||||
|
name = Column(String, nullable=False)
|
||||||
|
|
||||||
|
|
||||||
|
class RecipeSite(Base):
|
||||||
|
__tablename__ = "RecipeSite"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True)
|
||||||
|
name = Column(String, nullable=False, unique=True)
|
||||||
|
ingredient_class = Column(String, nullable=False)
|
||||||
|
name_class = Column(String, nullable=False)
|
||||||
|
base_url = Column(String, nullable=False, unique=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Recipe(Base):
|
||||||
|
__tablename__ = "Recipe"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True)
|
||||||
|
name = Column(String)
|
||||||
|
identifier = Column(String, nullable=False)
|
||||||
|
recipe_site_id = Column(Integer, ForeignKey("RecipeSite.id"))
|
||||||
|
UniqueConstraint(identifier, recipe_site_id)
|
||||||
|
|
||||||
|
|
||||||
|
class RecipeIngredient(Base):
|
||||||
|
__tablename__ = "RecipeIngredient"
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True)
|
||||||
|
text = Column(String, nullable=False)
|
||||||
|
recipe_id = Column(Integer, ForeignKey("Recipe.id"))
|
||||||
|
ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
|
||||||
|
|
||||||
|
|
||||||
|
class RecipeIngredientParts(Base):
|
||||||
|
__tablename__ = "RecipeIngredientParts"
|
||||||
|
|
||||||
|
id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
|
||||||
|
quantity = Column(String)
|
||||||
|
unit = Column(String)
|
||||||
|
instruction = Column(String)
|
||||||
|
ingredient = Column(String)
|
||||||
|
supplement = Column(String)
|
||||||
|
|
||||||
|
|
||||||
|
class IngredientConnection(Base):
|
||||||
|
__tablename__ = "IngredientConnection"
|
||||||
|
|
||||||
|
ingredient_a = Column(String, primary_key=True)
|
||||||
|
ingredient_b = Column(String, primary_key=True)
|
||||||
|
recipe_count = Column(Integer)
|
||||||
|
UniqueConstraint(ingredient_a, ingredient_b)
|
||||||
|
|
||||||
|
|
||||||
|
class RecipeConnection(Base):
|
||||||
|
__tablename__ = "RecipeConnection"
|
||||||
|
|
||||||
|
recipe_a = Column(Integer, ForeignKey("Recipe.id"), primary_key=True)
|
||||||
|
recipe_b = Column(Integer, ForeignKey("Recipe.id"), primary_key=True)
|
||||||
|
ingredient_count = Column(Integer)
|
||||||
|
|
||||||
|
|
||||||
|
class RecipeGraphed(Base):
|
||||||
|
__tablename__ = "RecipeGraphed"
|
||||||
|
|
||||||
|
recipe_id = Column(Integer, ForeignKey("Recipe.id"), primary_key=True)
|
||||||
|
status = Column(Boolean, nullable=False, default=False)
|
||||||
|
|
||||||
|
|
||||||
|
def get_engine(use_dotenv=True, **kargs):
|
||||||
|
if use_dotenv:
|
||||||
|
load_dotenv()
|
||||||
|
DB_URL = os.getenv("POSTGRES_URL")
|
||||||
|
DB_USER = os.getenv("POSTGRES_USER")
|
||||||
|
DB_PASSWORD = os.getenv("POSTGRES_PASSWORD")
|
||||||
|
DB_NAME = os.getenv("POSTGRES_DB")
|
||||||
|
|
||||||
|
eng_url = URL.create(
|
||||||
|
"postgresql",
|
||||||
|
username=DB_USER,
|
||||||
|
password=DB_PASSWORD,
|
||||||
|
host=DB_URL,
|
||||||
|
database=DB_NAME,
|
||||||
|
)
|
||||||
|
return create_engine(eng_url)
|
||||||
|
|
||||||
|
|
||||||
|
def get_session(**kargs) -> Session:
|
||||||
|
eng = get_engine(**kargs)
|
||||||
|
return sessionmaker(eng)
|
||||||
|
|
||||||
|
|
||||||
|
def create_tables(eng):
|
||||||
|
logging.info(f"Createing DB Tables: {eng.url}")
|
||||||
|
Base.metadata.create_all(eng, checkfirst=True)
|
||||||
|
|
||||||
|
|
||||||
|
def pair_query(pairable, groupable, recipe_ids=None, pair_type=String):
|
||||||
|
pair_func = func.text_pairs
|
||||||
|
if pair_type == Integer:
|
||||||
|
pair_func = func.int_pairs
|
||||||
|
|
||||||
|
new_pairs = select(
|
||||||
|
groupable,
|
||||||
|
pair_func(func.array_agg(pairable.distinct()), type_=ARRAY(pair_type)).label(
|
||||||
|
"pair"
|
||||||
|
),
|
||||||
|
).join(RecipeIngredientParts)
|
||||||
|
|
||||||
|
if not type(recipe_ids) == NoneType:
|
||||||
|
new_pairs = new_pairs.where(RecipeIngredient.recipe_id.in_(recipe_ids))
|
||||||
|
|
||||||
|
new_pairs = new_pairs.group_by(groupable).cte()
|
||||||
|
|
||||||
|
return new_pairs
|
||||||
|
|
||||||
|
|
||||||
|
def pair_count_query(pairs, countable, recipe_ids=None):
|
||||||
|
new_counts = select(pairs, func.count(func.distinct(countable)))
|
||||||
|
|
||||||
|
if not type(recipe_ids) == NoneType:
|
||||||
|
new_counts = new_counts.where(
|
||||||
|
or_(pairs[0].in_(recipe_ids), pairs[1].in_(recipe_ids))
|
||||||
|
)
|
||||||
|
|
||||||
|
new_counts = new_counts.group_by(pairs)
|
||||||
|
|
||||||
|
return new_counts
|
||||||
|
|
||||||
|
|
||||||
|
def update_graph_connectivity(session=None):
|
||||||
|
# this is pure SQLAlchemy so it is more portable
|
||||||
|
# This would have been simpler if I utilized Postgres specific feature
|
||||||
|
if not session:
|
||||||
|
session = Session(get_engine())
|
||||||
|
|
||||||
|
with session.begin():
|
||||||
|
ids = (
|
||||||
|
select(Recipe.id)
|
||||||
|
.join(RecipeGraphed, isouter=True)
|
||||||
|
.where(RecipeGraphed.status.is_not(True))
|
||||||
|
)
|
||||||
|
|
||||||
|
num_recipes = session.execute(
|
||||||
|
select(func.count("*")).select_from(ids.cte())
|
||||||
|
).fetchone()[0]
|
||||||
|
if num_recipes <= 0:
|
||||||
|
logging.info("no new recipies")
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info(f"adding {num_recipes} recipes to the graphs")
|
||||||
|
|
||||||
|
new_pairs = pair_query(
|
||||||
|
RecipeIngredientParts.ingredient, RecipeIngredient.recipe_id, recipe_ids=ids
|
||||||
|
)
|
||||||
|
|
||||||
|
new_counts = pair_count_query(new_pairs.c.pair, new_pairs.c.recipe_id)
|
||||||
|
|
||||||
|
logging.info("addeing new ingredient connections")
|
||||||
|
for pair, count in session.execute(new_counts):
|
||||||
|
connection = (
|
||||||
|
session.query(IngredientConnection)
|
||||||
|
.where(
|
||||||
|
and_(
|
||||||
|
IngredientConnection.ingredient_a == pair[0],
|
||||||
|
IngredientConnection.ingredient_b == pair[1],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
if connection:
|
||||||
|
connection.recipe_count += count
|
||||||
|
session.merge(connection)
|
||||||
|
else:
|
||||||
|
session.add(
|
||||||
|
IngredientConnection(
|
||||||
|
ingredient_a=pair[0], ingredient_b=pair[1], recipe_count=count
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# update RecipeConnection
|
||||||
|
logging.info("adding new recipe connections")
|
||||||
|
all_pairs = pair_query(
|
||||||
|
RecipeIngredient.recipe_id,
|
||||||
|
RecipeIngredientParts.ingredient,
|
||||||
|
pair_type=Integer,
|
||||||
|
)
|
||||||
|
|
||||||
|
new_counts = pair_count_query(
|
||||||
|
all_pairs.c.pair, all_pairs.c.ingredient, recipe_ids=ids
|
||||||
|
)
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
for pair, count in session.execute(new_counts):
|
||||||
|
session.add(
|
||||||
|
RecipeConnection(
|
||||||
|
recipe_a=pair[0], recipe_b=pair[1], ingredient_count=count
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# flush often to reduce memory usage
|
||||||
|
i += 1
|
||||||
|
if (i % 100000) == 0:
|
||||||
|
session.flush()
|
||||||
|
|
||||||
|
# update RecipeGraphed.status
|
||||||
|
logging.info("updating existing RecipeGraphed rows")
|
||||||
|
for recipeGraphed in session.query(RecipeGraphed).where(
|
||||||
|
RecipeGraphed.recipe_id.in_(ids)
|
||||||
|
):
|
||||||
|
recipeGraphed.status = True
|
||||||
|
session.merge(recipeGraphed)
|
||||||
|
|
||||||
|
graphed = select(RecipeGraphed.recipe_id)
|
||||||
|
|
||||||
|
# add recipies that aren't in the table
|
||||||
|
logging.info("adding new RecipeGraphed rows")
|
||||||
|
for recipe in session.query(Recipe).where(
|
||||||
|
and_(Recipe.id.in_(ids), not_(Recipe.id.in_(graphed)))
|
||||||
|
):
|
||||||
|
session.add(RecipeGraphed(recipe_id=recipe.id, status=True))
|
||||||
|
|
||||||
|
|
||||||
|
def main(): # pragma: no cover
|
||||||
|
eng = get_engine()
|
||||||
|
create_tables(eng)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": # pragma: no cover
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
from pydoc import apropos
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from recipe_graph import db
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def load_file(f_name: str) -> list[dict[str, any]]:
|
||||||
|
with open(f_name) as f:
|
||||||
|
sites = json.load(f)
|
||||||
|
return sites
|
||||||
|
|
||||||
|
|
||||||
|
def setup_argparser(args) -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="Import recipes into database")
|
||||||
|
parser.add_argument("file", type=str, help="JSON file with recipe site information")
|
||||||
|
parser.add_argument("-v", "--verbose", action="store_true")
|
||||||
|
|
||||||
|
return parser.parse_args(args)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(args: argparse.Namespace) -> logging.Logger:
|
||||||
|
logger = logging.Logger("insert_sites", logging.WARNING)
|
||||||
|
if args.verbose:
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
|
||||||
|
return logger
|
||||||
|
|
||||||
|
|
||||||
|
def add_sites(
|
||||||
|
S: Session,
|
||||||
|
sites: list[dict[str, any]],
|
||||||
|
logger: logging.Logger = None,
|
||||||
|
):
|
||||||
|
with S.begin() as session:
|
||||||
|
for site in sites:
|
||||||
|
if logger: # pragma: no cover
|
||||||
|
logger.info(f"Adding {site}")
|
||||||
|
session.add(db.RecipeSite(**site))
|
||||||
|
|
||||||
|
|
||||||
|
def main(): # pragma: no cover
|
||||||
|
args = setup_argparser(sys.argv[1:])
|
||||||
|
logger = setup_logging(args)
|
||||||
|
|
||||||
|
S = db.get_session()
|
||||||
|
sites = load_file(args.file)
|
||||||
|
|
||||||
|
add_sites(S, sites, logger)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": # pragma: no cover
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,229 @@
|
||||||
|
import sys
|
||||||
|
from recipe_graph import db
|
||||||
|
import re
|
||||||
|
from sqlalchemy import select, desc, exists, not_, except_
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
import bs4
|
||||||
|
import requests as req
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
import logging
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
|
||||||
|
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
|
||||||
|
ingredient_regex = "([a-zA-Z '\-]+)"
|
||||||
|
supplement_regex = ",?(.*)"
|
||||||
|
units_regex = "|".join(
|
||||||
|
[f"[{unit[0]}{unit[0].capitalize()}]{unit[1:]}" for unit in units]
|
||||||
|
)
|
||||||
|
units_regex = f"((?:(?:{units_regex})e?s?)?)"
|
||||||
|
instructions_regex = "|".join(
|
||||||
|
[f"[{inst[0]}{inst[0].capitalize()}]{inst[1:]}" for inst in instructions]
|
||||||
|
)
|
||||||
|
instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"
|
||||||
|
|
||||||
|
return re.compile(
|
||||||
|
number_regex
|
||||||
|
+ units_regex
|
||||||
|
+ instructions_regex
|
||||||
|
+ ingredient_regex
|
||||||
|
+ supplement_regex
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO: load units and instructions from config.
|
||||||
|
# Moved data into optional parameters for the time being.
|
||||||
|
def parse_ingredient(
|
||||||
|
ingredient_text: str,
|
||||||
|
units: list[str] = [ "teaspoon", "tablespoon", "gram", "ounce", "jar",
|
||||||
|
"cup", "pinch", "container", "slice", "package",
|
||||||
|
"pound", "can", "dash", "spear", "bunch", "quart",
|
||||||
|
"cube", "envelope", "square", "sprig", "bag", "box",
|
||||||
|
"drop", "fluid ounce", "gallon", "head", "link",
|
||||||
|
"loaf", "pint", "pod", "sheet", "stalk", "whole",
|
||||||
|
"bar", "bottle", "bulb", "year", "fillet", "litter",
|
||||||
|
"packet", "slices"],
|
||||||
|
instructions: list[str] = [
|
||||||
|
"and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
|
||||||
|
"medium", "large", "couarse", "cracked", "crushed", "ground", "cooked",
|
||||||
|
"cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
|
||||||
|
"drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
|
||||||
|
"grated", "grilled", "hard", "hot", "juliened?", "leftover", "light",
|
||||||
|
"lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
|
||||||
|
"sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
|
||||||
|
"shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
|
||||||
|
"thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
|
||||||
|
"unopened", "unseasoned"],
|
||||||
|
):
|
||||||
|
regex = ingredient_regex(units, instructions)
|
||||||
|
|
||||||
|
m = regex.match(ingredient_text)
|
||||||
|
logging.info(f"Parsed {ingredient_text}, found: {m}")
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return [text.strip() if text else None for text in m.groups()]
|
||||||
|
|
||||||
|
# this code is unused
|
||||||
|
# TODO: add tests when this is used
|
||||||
|
def missing_ingredients_query(session):
|
||||||
|
cte = (
|
||||||
|
except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
|
||||||
|
).alias("missing")
|
||||||
|
missing = (
|
||||||
|
session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
|
||||||
|
)
|
||||||
|
return missing
|
||||||
|
|
||||||
|
# this code is unused
|
||||||
|
# TODO: add tests when this is used
|
||||||
|
def parse_missing_ingredients(session):
|
||||||
|
missing = missing_ingredients_query(session)
|
||||||
|
for ingredient in missing:
|
||||||
|
parts = ingredient_to_parts(ingredient)
|
||||||
|
session.add(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def load_page(recipe_url: str) -> bs4.BeautifulSoup:
|
||||||
|
try:
|
||||||
|
logging.info(f"Loading Page: {recipe_url}")
|
||||||
|
with req.get(recipe_url) as resp:
|
||||||
|
if resp.status_code == 404:
|
||||||
|
raise Exception(f"Page does not exist (404): {recipe_url}")
|
||||||
|
return bs4.BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Could not download or parse recipe: {recipe_url}")
|
||||||
|
logging.warning(e)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: str = None,
) -> db.Recipe:
    """Extract the recipe name from *page* and store it on *recipe*.

    When no url is supplied, a {"site": ..., "recipe": ...} dict stands in
    for it in log/error messages (NOTE(review): despite the `str`
    annotation — callers' tests rely on that exact message format).

    Raises Exception when the page has no element with the site's
    configured name class.
    """
    if not url:
        url = {"site": site.base_url, "recipe": recipe.identifier}

    matches = page.find_all(class_=site.name_class)
    if not matches:
        raise Exception(f"Could not extract recipe name: {url}")

    recipe.name = matches[0].text
    logging.info(f"Adding Recipe {recipe.name} from {url}")
    return recipe
|
||||||
|
|
||||||
|
def ingredient_to_parts(
    ingredient: db.Ingredient
) -> db.RecipeIngredientParts:
    """Parse an ingredient's free text into a RecipeIngredientParts row.

    The parts row reuses the source ingredient's id as its own primary
    key. Returns None when the text cannot be parsed.
    """
    parsed = parse_ingredient(ingredient.text)
    if not parsed:
        return None

    quantity, unit, instruction, name, supplement = parsed
    return db.RecipeIngredientParts(
        id=ingredient.id,
        quantity=quantity,
        unit=unit,
        instruction=instruction,
        ingredient=name,
        supplement=supplement,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and stage the recipe and its ingredients.

    Returns the populated recipe, or None when the page could not be
    loaded. Each row is flushed so its id is assigned before dependent
    rows are built from it.
    """
    page_url = urljoin(site.base_url, str(recipe.identifier))
    page = load_page(page_url)
    if not page:
        return None

    recipe = parse_recipe_name(site, page, recipe, page_url)
    session.add(recipe)
    session.flush()  # assigns recipe.id for the ingredient foreign keys

    ingredient_divs = page.find_all(class_=site.ingredient_class)
    for div in ingredient_divs:
        row = db.RecipeIngredient(text=div.text, recipe_id=recipe.id)
        session.add(row)
        session.flush()  # assigns row.id, reused by the parts row

        ingredient_parts = ingredient_to_parts(row)
        if ingredient_parts:
            session.add(ingredient_parts)

    logging.info(f"{len(ingredient_divs)} ingredients found. Inserting into DB")

    return recipe
|
||||||
|
|
||||||
|
|
||||||
|
def main():  # pragma: no cover
    """Command-line entry point: scrape one or more recipes from a site.

    Either a single recipe identifier (-id) is fetched, or -a N scrapes N
    consecutive identifiers, starting from -id or from the highest
    identifier already stored for the site.
    """
    parser = ArgumentParser(description="Scrape a recipe site for recipies")
    parser.add_argument("site", help="Name of site")
    parser.add_argument(
        "-id",
        "--identifier",
        dest="id",
        help="url of recipe(reletive to base url of site) or commma seperated list",
    )
    parser.add_argument(
        "-a",
        "--auto",
        action="store",
        dest="n",
        help="automaticaly generate identifier(must supply number of recipies to scrape)",
    )
    parser.add_argument("-v", "--verbose", action="store_true")

    # Bug fix: parse_args(sys.argv) included argv[0] (the program name),
    # which argparse consumed as the positional `site` argument.
    args = parser.parse_args(sys.argv[1:])
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

    eng = db.get_engine()
    S = sessionmaker(eng)

    with S.begin() as sess:
        site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
        site_id = site.id

        recipe_ids = []
        starting_id = 0
        if args.id and not args.n:
            recipe_ids.append(args.id)
            logging.info(f"Retreiving single recipe: {args.id}")
        elif args.n:
            if not args.id:
                # Resume after the highest identifier already scraped
                # for this site.
                last_recipe = (
                    sess.query(db.Recipe)
                    .where(db.Recipe.recipe_site_id == site.id)
                    .order_by(desc(db.Recipe.identifier))
                    .limit(1)
                    .scalar()
                )
                starting_id = int(last_recipe.identifier) + 1
            else:
                starting_id = int(args.id)
            recipe_ids = range(starting_id, starting_id + int(args.n))
            logging.info(
                f"Retreving {args.n} recipes from {site.base_url} starting at {starting_id}"
            )

        for recipe_id in recipe_ids:
            # Bug fix: begin_nested() moved outside the try so `savepoint`
            # is always bound before either except block rolls it back.
            savepoint = sess.begin_nested()
            try:
                recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
                parse_recipe(sess, recipe, site)

                savepoint.commit()
            except KeyboardInterrupt:
                savepoint.rollback()
                break
            except Exception as e:
                savepoint.rollback()
                logging.error(e)
                continue
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point; excluded from coverage like main() itself.
if __name__ == "__main__":  # pragma: no cover
    main()
|
||||||
190
src/scrape.py
190
src/scrape.py
|
|
@ -1,190 +0,0 @@
|
||||||
from ast import alias
|
|
||||||
from dis import Instruction
|
|
||||||
import db
|
|
||||||
import re
|
|
||||||
from sqlalchemy import select, desc, exists, not_, except_
|
|
||||||
from sqlalchemy.exc import IntegrityError
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
import bs4
|
|
||||||
from urllib.request import urlopen
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
import logging
|
|
||||||
from argparse import ArgumentParser
|
|
||||||
|
|
||||||
def parse_ingredient(ingredient_text):
    """Split free-form ingredient text into its five parts.

    Returns [quantity, unit, instruction, ingredient, supplement], each
    stripped (or None when the group is empty), or None when the text
    does not match the pattern at all.
    """
    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
             'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
             'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
             'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
             'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
             'year', 'fillet', 'litter', 'packet', 'slices']
    instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
                    'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
                    'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
                    'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
                    'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
                    'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
                    'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
                    'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
                    'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
                    'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
                    'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
                    'unopened', 'unseasoned']

    # Bug fix: the original literals were non-raw strings relying on
    # invalid escape sequences (\d, \s, \( ...), which emit
    # DeprecationWarning/SyntaxWarning on modern Python. Raw strings
    # below are value-identical. The character class also matches the
    # unicode vulgar-fraction ranges (e.g. ¼).
    number_regex = r'((?:[\d\./\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
    ingredient_regex = r"([a-zA-Z '\-]+)"
    supplement_regex = ',?(.*)'
    # Each word may start upper- or lower-case, with an optional plural.
    units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
                            for unit in units])
    units_regex = f"((?:(?:{units_regex})e?s?)?)"
    instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}'
                                   for inst in instructions])
    instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"

    regex = re.compile(number_regex +
                       units_regex +
                       instructions_regex +
                       ingredient_regex +
                       supplement_regex)

    m = regex.match(ingredient_text)
    logging.info(f"Parsed {ingredient_text}, found: {m}")
    if not m:
        return None

    return [text.strip() if text else None for text in m.groups()]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def reparse_ingredients(session):
    """Parse parts for every ingredient that has no parts row yet.

    The EXCEPT of the two id columns selects ingredients that were never
    parsed; each is re-run through parse_ingredient and, when the text
    matches, a RecipeIngredientParts row sharing its id is staged.
    """
    unparsed = except_(
        select(db.RecipeIngredient.id),
        select(db.RecipeIngredientParts.id),
    ).alias('missing')
    missing = (
        session.query(db.RecipeIngredient)
        .where(db.RecipeIngredient.id.in_(unparsed))
        .all()
    )

    for ingredient in missing:
        parts = parse_ingredient(ingredient.text)
        if not parts:
            continue
        quantity, unit, instruction, name, supplement = parts
        session.add(db.RecipeIngredientParts(
            id=ingredient.id,
            quantity=quantity,
            unit=unit,
            instruction=instruction,
            ingredient=name,
            supplement=supplement,
        ))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_recipe(recipe_url):
    """Download a recipe page with urlopen; return soup or None on failure.

    Failures (network errors, parse errors, explicit 404) are logged at
    WARNING level rather than propagated.
    """
    try:
        logging.info(f'Loading Recipe: {recipe_url}')
        with urlopen(recipe_url) as response:
            if response.getcode() == 404:
                raise Exception(f"Recipe Does not exist: {recipe_url}")
            return bs4.BeautifulSoup(response.read().decode(), 'html.parser')
    except Exception as err:
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(err)

    return None
|
|
||||||
|
|
||||||
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and stage the recipe plus its ingredients.

    Returns the populated recipe, or None when the page failed to load.
    Raises when the page has no element with the site's name class.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_recipe(recipe_url)
    if not recipe_page:
        return None

    # The first element carrying the site's configured name class is the title.
    name_candidates = recipe_page.find_all(class_=site.name_class)
    if len(name_candidates) == 0:
        raise Exception(f"Could not extract recipe name: {recipe_url}")
    name_div = name_candidates[0]
    recipe.name = name_div.text

    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")

    # Flush so recipe.id is assigned before ingredient rows reference it.
    session.add(recipe)
    session.flush()

    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
    for candidate in ingred_candidates:
        ingred = db.RecipeIngredient(text=candidate.text,
                                     recipe_id=recipe.id)
        # Flush so ingred.id exists; the parts row reuses it as its own id.
        session.add(ingred)
        session.flush()

        parts = parse_ingredient(ingred.text)
        if parts:
            quantity, unit, instruction, ingredient, supplement = parts
            ingred_parts = db.RecipeIngredientParts(id = ingred.id,
                                                    quantity = quantity,
                                                    unit = unit,
                                                    instruction = instruction,
                                                    ingredient = ingredient,
                                                    supplement = supplement)
            session.add(ingred_parts)

    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")

    return recipe
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level script body (old layout, before this logic moved into main()).
# NOTE: runs at import time — parses argv, opens a DB session, and scrapes.
parser = ArgumentParser(description="Scrape a recipe site for recipies")
parser.add_argument('site',
                    help='Name of site')
parser.add_argument('-id', '--identifier', dest='id',
                    help='url of recipe(reletive to base url of site) or commma seperated list')
parser.add_argument('-a', '--auto', action='store', dest='n',
                    help='automaticaly generate identifier(must supply number of recipies to scrape)')
parser.add_argument('-v', '--verbose', action='store_true')

args = parser.parse_args()
if args.verbose:
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

eng = db.get_engine()
S = sessionmaker(eng)

with S.begin() as sess:
    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
    site_id = site.id

    # Build the list of identifiers to scrape: a single explicit id, or a
    # range of args.n consecutive ids starting at args.id (or after the
    # highest identifier already stored for the site).
    recipe_ids = []
    starting_id = 0
    if args.id and not args.n:
        recipe_ids.append(args.id)
        logging.info(f'Retreiving single recipe: {args.id}')
    elif args.n:
        if not args.id:
            last_recipe = sess.query(db.Recipe).\
                where(db.Recipe.recipe_site_id == site.id).\
                order_by(desc(db.Recipe.identifier)).\
                limit(1).\
                scalar()
            starting_id = int(last_recipe.identifier) + 1
        else:
            starting_id = int(args.id)
        recipe_ids = range(starting_id, starting_id+int(args.n))
        logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')

    # Each recipe gets its own savepoint so one failure does not discard
    # earlier successes; Ctrl-C stops the loop cleanly.
    for recipe_id in recipe_ids:
        try:
            savepoint = sess.begin_nested()

            recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
            parse_recipe(sess, recipe, site)

            savepoint.commit()
        except KeyboardInterrupt as e:
            savepoint.rollback()
            break
        except Exception as e:
            savepoint.rollback()
            logging.error(e)
            continue
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
import inspect
|
||||||
|
from recipe_graph import db
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
import sqlalchemy
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def engine() -> sqlalchemy.engine.Engine:
    """Yield an engine pointing at an EMPTY database; drop all tables after.

    Safety guard: a live database would already contain tables, so the
    fixture refuses to continue (and therefore never reaches drop_all)
    unless only the two default postgres schemas exist and "public"
    holds no tables.
    """
    engine = db.get_engine()
    inspector = sqlalchemy.inspect(engine)
    assert list(inspector.get_schema_names()) == ["information_schema", "public"]
    assert inspector.get_table_names(schema="public") == []

    yield engine
    db.Base.metadata.drop_all(engine)
|
||||||
|
|
||||||
|
|
||||||
|
def init_db(engine) -> sqlalchemy.engine.Engine:
    """Create all model tables on *engine* and hand the engine back."""
    db.create_tables(engine)
    return engine
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def tables() -> list[db.Base]:
    """Collect every ORM model class defined in the db module."""
    return [
        obj
        for _, obj in inspect.getmembers(db)
        if inspect.isclass(obj) and issubclass(obj, db.Base) and obj is not db.Base
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_db_connection(engine):
    """The configured engine must accept (and release) a connection.

    Bug fix: the original called engine.connect() without ever closing
    the connection; a context manager releases it. The assert also moves
    out of a finally block (where it could mask the original error
    context) and drops the `== True` anti-idiom.
    """
    try:
        with engine.connect():
            connected = True
    except SQLAlchemyError:
        connected = False
    assert connected
|
||||||
|
|
||||||
|
def test_get_session():
    """db.get_session() must hand back a usable session factory.

    Bug fix: the original line `session == sessionmaker(eng)` was a bare
    comparison with no `assert`, so the test verified nothing — and two
    distinct sessionmaker instances never compare equal anyway.
    """
    session = db.get_session()
    assert session is not None
    # NOTE(review): a stronger check (e.g. that the factory is bound to
    # db.get_engine()) would need support from the db module — TODO.
    eng = db.get_engine()
    assert eng is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_db_classes(tables):
    """At least one ORM model class must be discoverable."""
    assert tables, "no ORM model classes found in db module"
|
||||||
|
|
||||||
|
|
||||||
|
def test_db_class_creation(tables: list[db.Base], engine: sqlalchemy.engine.Engine):
    """create_tables() must create exactly one table per model class."""
    db.create_tables(engine)
    created = sqlalchemy.inspect(engine).get_table_names(schema="public")
    expected = [cls.__tablename__ for cls in tables]
    assert len(created) == len(expected)
    assert sorted(created) == sorted(expected)
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from recipe_graph import insert_sites, db
|
||||||
|
from sqlalchemy import select
|
||||||
|
import sqlalchemy
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from test_db import engine, init_db
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def json_data() -> list[dict[str, any]]:
    """Arbitrary JSON-serialisable payload for file round-trip tests."""
    return [{"key": "value"}, {"test": "value1", "test2": "value2"}]
|
||||||
|
|
||||||
|
@pytest.fixture
def db_initialized(engine) -> sqlalchemy.engine.Engine:
    """The empty-db engine fixture with all model tables created."""
    init_db(engine)
    return engine
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_sites() -> list[dict[str, any]]:
    """Two fake site definitions matching the RecipeSite column names."""
    return [
        {
            "name": f"{prefix}-site",
            "ingredient_class": f"{prefix}-item-name",
            "name_class": f"{prefix}-content",
            "base_url": f"https://www.{prefix}.com/recipe/",
        }
        for prefix in ("example", "test")
    ]
|
||||||
|
|
||||||
|
@pytest.fixture
def json_file(json_data: list[dict]) -> str:
    """Dump json_data to a scratch file, yield its path, then clean up."""
    f_path = "test.json"
    with open(f_path, "w") as handle:
        json.dump(json_data, handle)

    yield f_path

    if os.path.exists(f_path):
        os.remove(f_path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_file(json_file: str, json_data):
    """load_file must round-trip exactly what was dumped to disk."""
    assert insert_sites.load_file(json_file) == json_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_setup_argparser():
    """setup_argparser must expose exactly `file` and `verbose`."""
    file_name = "test"

    args = insert_sites.setup_argparser([file_name])
    assert len(vars(args)) == 2
    assert args.file == file_name
    assert not args.verbose

    # Both spellings of the flag must enable verbosity.
    for flag in ("-v", "--verbose"):
        args = insert_sites.setup_argparser([file_name, flag])
        assert args.file == file_name
        assert args.verbose
|
||||||
|
|
||||||
|
|
||||||
|
def test_setup_logging():
    """The verbosity flag must switch the logger from WARNING to INFO."""
    args = insert_sites.setup_argparser(["test"])
    assert insert_sites.setup_logging(args).level == logging.WARNING

    for flag in ("-v", "--verbose"):
        args = insert_sites.setup_argparser(["test", flag])
        assert insert_sites.setup_logging(args).level == logging.INFO
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_sites(mock_sites, db_initialized):
    """add_sites must persist one RecipeSite row per input dict.

    Bug fixes: the per-field comparisons in the loop were bare
    expressions with no `assert`, so they verified nothing; the stray
    debug print() is removed; ORM attribute access stays inside the
    session scope so rows are not detached when read.
    """
    db_session = db.get_session()
    insert_sites.add_sites(db_session, mock_sites)

    with db_session.begin() as session:
        results = session.execute(select(db.RecipeSite)).all()

        assert len(results) == 2
        for expected, (site,) in zip(mock_sites, results):
            assert site.name == expected["name"]
            assert site.ingredient_class == expected["ingredient_class"]
            assert site.name_class == expected["name_class"]
            assert site.base_url == expected["base_url"]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,115 @@
|
||||||
|
from recipe_graph import scrape
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
|
||||||
|
|
||||||
|
from pytest import fixture
|
||||||
|
|
||||||
|
|
||||||
|
@fixture
def mock_site():
    """A RecipeSite whose CSS classes match mock_page below."""
    return RecipeSite(
        name="mock-site",
        ingredient_class="mock-ing",
        name_class="mock-name",
        base_url="example-site/mock-site",
    )


# TODO: should probably load HTML from file
@fixture
def mock_page():
    """Minimal recipe page: one name div and one ingredient div."""
    return BeautifulSoup(
        """
        <header></header><body>
        <div class="mock-name">test_recipe</div>
        <div class="mock-ing">test_ingredient</div>
        </body>
        """,
        "html.parser",
    )


@fixture
def mock_blank_page():
    """A page with no recipe content at all (name extraction must fail)."""
    return BeautifulSoup(""" <header></header><body> </body> """, "html.parser")


@fixture
def mock_recipe():
    """A recipe whose name matches the one embedded in mock_page."""
    return Recipe(name="test_recipe", identifier="mock_1")


@fixture
def mock_ingredient():
    """An ingredient line the regex parser can split cleanly."""
    return RecipeIngredient(text="1 ounce water")


@fixture
def mock_url():
    """Matches mock_site.base_url."""
    return "example-site/mock-site"
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_page():
    """load_page returns soup for a good URL and None for a 404.

    NOTE(review): depends on live network access to google.com; consider
    serving fixture HTML locally instead so the suite runs offline.
    """
    page = scrape.load_page("https://www.google.com")
    assert type(page) == BeautifulSoup

    page = scrape.load_page("https://www.google.com/some-nonsense")
    assert page == None
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingredient_regex():
    """ingredient_regex must OR the given unit and instruction words into
    the expected five-group pattern (quantity, unit, instructions,
    ingredient, supplement), with case-insensitive first letters and
    optional plural/adverb suffixes."""
    regex = scrape.ingredient_regex(["cup"], ["crushed"])
    assert (
        regex.pattern
        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
    )
    # Multiple words become alternations inside the same groups.
    regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
    assert (
        regex.pattern
        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
    )
|
||||||
|
|
||||||
|
def test_parse_ingredient(mock_ingredient):
    """parse_ingredient splits text into [qty, unit, instruction, name,
    supplement], and returns None when nothing matches."""
    parts = scrape.parse_ingredient(mock_ingredient.text)
    assert parts == ['1', 'ounce', '', 'water', None]

    parts = scrape.parse_ingredient("Water")
    assert parts == [None, None, None, 'Water', None]

    assert scrape.parse_ingredient("") is None
|
||||||
|
|
||||||
|
def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url, mock_blank_page,):
    """parse_recipe_name fills in the recipe name, and raises (with the
    site/identifier dict in the message) when the page has no name div.

    Bug fix: the message assertion referenced `e`, which is scoped to the
    except block and deleted when it exits; the saved `ex` is used
    instead, and the assertions run after confirming something raised.
    """
    expected_name = mock_recipe.name
    mock_recipe.name = None

    mock_recipe = scrape.parse_recipe_name(
        mock_site,
        mock_page,
        mock_recipe,
    )
    assert mock_recipe.name == expected_name

    # The blank page has no name div, so the parser must raise.
    ex = None
    try:
        mock_recipe = scrape.parse_recipe_name(
            mock_site,
            mock_blank_page,
            mock_recipe,
        )
    except Exception as e:
        ex = e
    assert ex is not None
    url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
    assert str(ex) == f"Could not extract recipe name: {url}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingredient_to_parts(mock_ingredient):
    """ingredient_to_parts maps '1 ounce water' onto the parts columns."""
    parts = scrape.ingredient_to_parts(mock_ingredient)
    assert parts.quantity == "1"
    assert parts.unit == "ounce"
    assert parts.instruction == ""
    assert parts.ingredient == "water"
    assert parts.supplement is None
|
||||||
|
|
||||||
Loading…
Reference in New Issue