From 339195fda694e5dee7b2f8a827e32b722e54434b Mon Sep 17 00:00:00 2001
From: Andrei Stoica
Date: Thu, 18 May 2023 08:57:33 -0400
Subject: [PATCH] moved from urllib to requests

---
 pyproject.toml             |  3 ++-
 requirements.txt           |  1 +
 src/recipe_graph/scrape.py | 19 +++++++++----------
 test/test_scrape.py        |  4 ++--
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6a037f4..97eb16d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,5 +10,6 @@ dependencies = [
     "SQLAlchemy==1.4.39",
     "python-dotenv==0.20.0",
     "beautifulsoup4==4.11.1",
-    "psycopg2-binary==2.9.3"
+    "psycopg2-binary==2.9.3",
+    "requests~=2.30.0"
 ]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9bce4cb..85b77c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ pyparsing==3.0.9
 pytest==7.1.3
 pytest-cov==4.0.0
 python-dotenv==0.20.0
+requests~=2.30.0
 soupsieve==2.3.2.post1
 SQLAlchemy==1.4.39
 tomli==2.0.1
diff --git a/src/recipe_graph/scrape.py b/src/recipe_graph/scrape.py
index fd58e82..42100da 100644
--- a/src/recipe_graph/scrape.py
+++ b/src/recipe_graph/scrape.py
@@ -4,7 +4,7 @@ import re
 from sqlalchemy import select, desc, exists, not_, except_
 from sqlalchemy.orm import sessionmaker
 import bs4
-from urllib.request import urlopen
+import requests as req
 from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser
@@ -73,23 +73,22 @@ def reparse_ingredients(session):
 
 
 
-def load_recipe(recipe_url):
+def load_page(recipe_url):
     try:
-        logging.info(f'Loading Recipe: {recipe_url}')
-        with urlopen(recipe_url) as f:
-            if f.getcode() == 404:
-                raise Exception(f"Recipe Does not exist: {recipe_url}")
-            return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
+        logging.info(f'Loading Page: {recipe_url}')
+        with req.get(recipe_url) as f:
+            if f.status_code == 404:
+                raise Exception(f"Page does not exist (404): {recipe_url}")
+            return bs4.BeautifulSoup(f.content.decode(), 'html.parser')
     except Exception as e:
         logging.warning(f"Could not download or parse recipe: {recipe_url}")
         logging.warning(e)
-    return None
 
 
 
 def parse_recipe(session, recipe, site):
     recipe_url = urljoin(site.base_url, str(recipe.identifier))
-    recipe_page = load_recipe(recipe_url)
+    recipe_page = load_page(recipe_url)
     if not recipe_page:
         return None
 
@@ -126,7 +125,7 @@ def parse_recipe(session, recipe, site):
     return recipe
 
 
-def main():
+def main(): # pragma: no cover
     parser = ArgumentParser(description="Scrape a recipe site for recipies")
 
     parser.add_argument('site', help='Name of site')
diff --git a/test/test_scrape.py b/test/test_scrape.py
index eaa84ac..905b56d 100644
--- a/test/test_scrape.py
+++ b/test/test_scrape.py
@@ -4,7 +4,7 @@ from bs4 import BeautifulSoup
 import pytest
 
 
-def test_load_recipe():
-    page = scrape.load_recipe("https://hs.andreistoica.ca:4943")
+def test_load_page():
+    page = scrape.load_page("https://hs.andreistoica.ca:4943")
     assert type(page) == BeautifulSoup
 