Saving spider results to database - python

I am currently thinking about a good way to save my scraped data to a database.
App flow:
Run the spider (data scraper), file located in spiders/
When the data has been collected successfully, save the data/items (title, link, pubDate) to the database using the class in pipeline.py
I would like your help on how to save the scraped data (title, link, pubDate) from spider.py to the database through pipeline.py; currently nothing connects these files. When the data has been scraped successfully, the pipeline needs to be notified, receive the data, and save it.
I'm very thankful for your help.
Spider.py
import urllib.request
import lxml.etree as ET

opener = urllib.request.build_opener()
tree = ET.parse(opener.open('https://nordfront.se/feed'))

items = [{'title': item.find('title').text,
          'link': item.find('link').text,
          'pubdate': item.find('pubDate').text}
         for item in tree.xpath("/rss/channel/item")]

for item in items:
    print(item['title'], item['link'], item['pubdate'])
Models.py
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL
from sqlalchemy import UniqueConstraint
import datetime
import settings

def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance.
    """
    return create_engine(URL(**settings.DATABASE))

DeclarativeBase = declarative_base()

# <--snip-->

def create_presstv_table(engine):
    DeclarativeBase.metadata.create_all(engine)

def create_nordfront_table(engine):
    DeclarativeBase.metadata.create_all(engine)

def _get_date():
    return datetime.datetime.now()

class Nordfront(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "nordfront"

    id = Column(Integer, primary_key=True)
    title = Column('title', String)
    description = Column('description', String, nullable=True)
    link = Column('link', String, unique=True)
    date = Column('date', String, nullable=True)
    created_at = Column('created_at', DateTime, default=_get_date)
Pipeline.py
from sqlalchemy.orm import sessionmaker
from models import Nordfront, db_connect, create_nordfront_table

class NordfrontPipeline(object):
    """Pipeline for storing scraped items in the database"""

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_nordfront_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save data in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        deal = Nordfront(**item)

        if session.query(Nordfront).filter_by(link=item['link']).first() is None:
            try:
                session.add(deal)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()

        return item
Settings.py
DATABASE = {'drivername': 'postgres',
            'host': 'localhost',
            'port': '5432',
            'username': 'toothfairy',
            'password': 'password123',
            'database': 'news'}

As far as I understand, this is a Scrapy-specific question. If so, you just need to activate your pipeline in settings.py:
ITEM_PIPELINES = {
    'myproj.pipeline.NordfrontPipeline': 100
}
This would let the engine know to send the crawled items to the pipeline (see the Scrapy data flow).
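For that to work, the scraping itself has to live in a Scrapy spider that yields the items. Here is a minimal sketch of what a spiders/nordfront_spider.py could look like; the file name, spider name and project name are my own assumptions, not taken from the question, and the yielded keys are chosen to match the Nordfront model columns because the pipeline calls Nordfront(**item):
import scrapy

class NordfrontSpider(scrapy.Spider):
    name = "nordfront"
    start_urls = ["https://nordfront.se/feed"]

    def parse(self, response):
        # Walk every <item> of the RSS feed and yield plain dicts;
        # Scrapy passes each yielded item through the enabled pipelines.
        for item in response.xpath("//channel/item"):
            yield {
                "title": item.xpath("title/text()").get(),
                "link": item.xpath("link/text()").get(),
                "date": item.xpath("pubDate/text()").get(),
            }
On older Scrapy versions, .extract_first() replaces .get().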
If we are not talking about Scrapy, then call process_item() directly from your spider:
from pipeline import NordfrontPipeline

...

pipeline = NordfrontPipeline()
for item in items:
    pipeline.process_item(item, None)
You may also remove the spider argument from the process_item() pipeline method since it is not used.

Related

Python/Flask/SQLAlchemy data in instance list persisting after page reloads

Project structure
app/
-- entrypoint.py
-- database.py
-- models/
---- person.py
---- filtering_class.py
-- pages/
---- routes.py
Files involved in the issue:
entrypoint.py
from flask import Flask
import models
app = Flask(__name__, instance_relative_config=True)
database.py
from flask import _app_ctx_stack
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, scoped_session

SQLALCHEMY_DATABASE_URI = 'sqlite:///persons.db'
SQLALCHEMY_TRACK_MODIFICATIONS = False

engine = create_engine(
    SQLALCHEMY_DATABASE_URI,
    connect_args={"check_same_thread": False}
)

SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine
)

session = scoped_session(
    SessionLocal,
    scopefunc=_app_ctx_stack.__ident_func__
)

Base = declarative_base()
routes.py
from flask import render_template, request
from entrypoint import app
from database import session
from models import Person

@app.route('/person')
def person():
    response_id = request.args.get('id')
    person = session.query(Person).filter_by(id=response_id).first()
    person = Person.get_others(person=person)
    return render_template('person.html',
                           person=person)
person.py
from sqlalchemy import Column, Integer, String

from database import Base, session
from models import FilteringClass

class Person(Base):
    __tablename__ = 'persons'

    id = Column(Integer, primary_key=True)
    first_name = Column(String(80), nullable=True)
    last_name = Column(String(80), nullable=True)

    @staticmethod
    def get_persons(filtering_class=None):
        query = session.query(Person.id,
                              Person.first_name,
                              Person.last_name)
        if filtering_class:
            query = filtering_class.apply_filters(query)
        results = query.all()
        return results

    @staticmethod
    def get_others(person):
        # Here lies the problem
        filtering_class = FilteringClass()
        # After the first call, len(filtering_class.custom_expressions) == 0 (expected),
        # after the second call len(...) is already 1, etc.
        filtering_class.add_custom_expression(Person.id != person.id)
        return Person.get_persons(filtering_class=filtering_class)
filtering_class.py
class FilteringClass(object):
    def __init__(self,
                 custom_expressions=[]):
        self.custom_expressions = custom_expressions

    def apply_filters(self, query):
        # Import Person here to avoid circular import issues in person.py
        from models.person import Person
        if self.custom_expressions:
            for exp in self.custom_expressions:
                query = query.filter(exp)
        return query

    def add_custom_expression(self, expression):
        self.custom_expressions.append(expression)
Description
FilteringClass is used to filter a passed query argument. It has a method for users of the class to add their own BinaryExpressions to be applied when FilteringClass.apply_filters() is called.
The goal here is to retrieve all Persons that are not the person who initiated the page request, by using the FilteringClass to exclude Person objects with the same ID.
The problem
The expected behavior would be that on each request, a new FilteringClass is instantiated (see Person.get_others --> filtering_class = FilteringClass()).
At that point it is expected that the internal custom_expressions array inside filtering_class instance would be empty, as defined in its constructor.
However, every time the page for the /person route is reloaded and the filtering_class instance is created, its custom_expressions array is already populated with the previously added custom expression. This means that on every page reload, filtering_class.custom_expressions grows without ever coming back to an empty state.
What I tried
Resetting custom_expressions to an empty list with self.custom_expressions = [] directly after filtering the passed query.
Calling session.close() right before returning the page template in the /person endpoint.
Calling session.commit() right before returning the page template in the /person endpoint (but I don't think I should have to commit anything for a SELECT statement anyway).
Sorry for the long post, I tried to include everything that could be useful (but please let me know if I should add anything).
UPDATE: Solution:
As per @larsks' comment, the issue was not related to SQLAlchemy but to a Python gotcha with mutable default arguments (see: https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments).
All there was to do to fix the issue was:
Changing FilteringClass' constructor from:
def __init__(self,
             custom_expressions=[]):
    self.custom_expressions = custom_expressions
to:
def __init__(self,
             custom_expressions=None):
    self.custom_expressions = custom_expressions
Changing the add_custom_expression method from:
def add_custom_expression(self, expression):
    self.custom_expressions.append(expression)
to:
def add_custom_expression(self, expression):
    if self.custom_expressions is None:
        self.custom_expressions = []
    self.custom_expressions.append(expression)
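For completeness, here is a tiny, framework-free demonstration of the gotcha (my own example, not from the original post): the default list is created once, when the function is defined, and is shared by every call that does not pass its own list.
class Collector(object):
    def __init__(self, items=[]):      # one list, created at definition time
        self.items = items

    def add(self, item):
        self.items.append(item)

a = Collector()
a.add("x")
b = Collector()
print(b.items)  # ['x'] -- b shares a's list, just like custom_expressions did
With the None default shown above, each instance builds its own list, so b.items would print [].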

Flask-Restx not converting enum field type to JSON

I need help with an Enum field type, as it is not accepted by Swagger and I am getting the error message TypeError: Object or Type eGameLevel is not JSON serializable. The complete code for the table, along with the DB and SQLAlchemy settings, is provided below. I already tried the Marshmallow-Enum Flask package and it didn't work. I am looking for kind help with some explanation about the solution so I can learn it well. :-)
I am using MySQL with Flask. In Postgres it is pretty easy to manage choice fields. All I need is a working example or a link to a repository where MySQL choice fields show up in the Swagger dropdown.
My Model:
import enum
from app import db
from typing import List

class eGameLevel(enum.Enum):
    BEGINNER = 'Beginner'
    ADVANCED = 'Advanced'

class Game(Base):
    __tablename__ = 'game_stage'

    id = db.Column(db.Integer(), primary_key=True)
    game_level = db.Column(db.Enum(eGameLevel),
                           default=eGameLevel.BEGINNER, nullable=False)
    user_id = db.Column(db.Integer(), db.ForeignKey('users.id', ondelete='CASCADE'), nullable=False)
    user = db.relationship('User', backref='game__level_submissions', lazy=True)

    def __init__(self, game_level, user_id):
        self.game_level = game_level
        self.user_id = user_id

    def __repr__(self):
        return 'Game(game_level%s, ' \
               'user_id%s' % (self.game_level,
                              self.user_id)

    def json(self):
        return {'game_level': self.game_level,
                'user_id': self.user_id}

    @classmethod
    def by_game_id(cls, _id):
        return cls.query.filter_by(id=_id)

    @classmethod
    def find_by_game_level(cls, game_level):
        return cls.query.filter_by(game_level=game_level)

    @classmethod
    def by_user_id(cls, _user_id):
        return cls.query.filter_by(user_id=_user_id)

    @classmethod
    def find_all(cls) -> List["Game"]:
        return cls.query.all()

    def save_to_db(self) -> None:
        db.session.add(self)
        db.session.commit()

    def delete_from_db(self) -> None:
        db.session.delete(self)
        db.session.commit()
My Schema
from app import ma
from app.models import Gode

class GameSchema(ma.SQLAlchemyAutoSchema):
    game = ma.Nested('GameSchema', many=True)

    class Meta:
        model = Game
        load_instance = True
        include_fk = True
My Resources:
from flask_restx import Resource, fields, Namespace
from app.models import Game
from app import db
from app.schemas import GameSchema

GAME_REQUEST_NOT_FOUND = "Game request not found."
GAME_REQUEST_ALREADY_EXSISTS = "Game request '{}' Already exists."

game_ns = Namespace('Game', description='Available Game Requests')
games_ns = Namespace('Game Requests', description='All Games Requests')

game_schema = GameSchema()
games_list_schema = GameSchema(many=True)

gamerequest = game_ns.model('Game', {
    'game_level': fields.String('Game Level: Must be one of: BEGINNER, ADVANCED.'),
    'user_id': fields.Integer,
})

class GameRequestsListAPI(Resource):
    @games_ns.doc('Get all Game requests.')
    def get(self):
        return games_list_schema.dump(Game.find_all()), 200

    @games_ns.expect(gamerequest)
    @games_ns.doc("Create a Game request.")
    def post(self):
        game_json = request.get_json()
        game_data = game_schema.load(game_json)
        game_data.save_to_db()
        return game_schema.dump(game_data), 201
Instead of trying to manage Enum fields in the MySQL schema, I suggest using a separate table with a backref in place of your eGameLevel. That gets rid of this whole fuss, and if you later need another option in your choice field, you won't have to hardcode it.
Simply create Game as the main table and a level table (with only one string field) as a sub-table; you will then be able to access the choices from your Game table, as sketched below.
Whenever I get stuck, I go back to basics, as described here.
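A rough sketch of what that could look like with Flask-SQLAlchemy (the class and column names below are my own guesses, not part of the original code):
class GameLevel(db.Model):
    __tablename__ = 'game_levels'
    id = db.Column(db.Integer(), primary_key=True)
    name = db.Column(db.String(50), unique=True, nullable=False)  # 'Beginner', 'Advanced', ...

class Game(db.Model):
    __tablename__ = 'game_stage'
    id = db.Column(db.Integer(), primary_key=True)
    level_id = db.Column(db.Integer(), db.ForeignKey('game_levels.id'), nullable=False)
    level = db.relationship('GameLevel', backref='games', lazy=True)
Adding a new level then becomes a plain INSERT into game_levels instead of a code change, and game.level.name serializes as an ordinary string.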
I made a small example just to test the serialization of an Enum
from enum import Enum

import sqlalchemy as sa
from flask import Flask
from flask_restx import Api, Namespace, Resource
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

class eGameLevel(str, Enum):
    BEGINNER = "Beginner"
    ADVANCED = "Advanced"

engine = sa.create_engine("sqlite:///:memory:")
session = scoped_session(sessionmaker(bind=engine))
Base = declarative_base()

class Game(Base):
    __tablename__ = "game"
    id = sa.Column(sa.Integer, primary_key=True, autoincrement=True)
    level = sa.Column(sa.Enum(eGameLevel), default=eGameLevel.BEGINNER, nullable=False)

    def __repr__(self):
        return f"Game(id={self.id}, level={self.level})"

    def json(self):
        data = {"id": self.id, "level": self.level}
        return data

Base.metadata.create_all(engine)

g1 = Game(level=eGameLevel.BEGINNER)
g2 = Game(level=eGameLevel.ADVANCED)
session.add_all([g1, g2])
session.commit()

query_content = session.query(Game).all()

games_ns = Namespace("Game Requests", description="All Games Requests")

app = Flask(__name__)
api = Api(app)

@api.route("/game")
class GameRequestsListAPI(Resource):
    @games_ns.doc("Get all Game requests.")
    def get(self):
        data = [x.json() for x in query_content]
        return data, 200

app.run(debug=True)
This example works and I think the serialization is possible due to the str in the Enum declaration: class eGameLevel(str, Enum).
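A quick way to convince yourself of that (my own check, outside the example above): the json module accepts the enum member only when it is also a str.
import json
from enum import Enum

class PlainLevel(Enum):
    BEGINNER = "Beginner"

class StrLevel(str, Enum):
    BEGINNER = "Beginner"

print(json.dumps({"level": StrLevel.BEGINNER}))    # {"level": "Beginner"}
# json.dumps({"level": PlainLevel.BEGINNER})       # TypeError: ... is not JSON serializable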
Instead of using Enum:
class eGameLevel(enum.Enum):
    BEGINNER = 'Beginner'
    ADVANCED = 'Advanced'
You can make use of a dictionary:
eGameLevel = {"BEGINNER": 1, "ADVANCED": 2}
Then you can replace the enum column in the SQL data model with an integer column:
game_level = db.Column(db.Integer(),
                       default=eGameLevel["BEGINNER"], nullable=False)
And make the appropriate checks against the defined dictionary throughout the application. This also avoids issues with Alembic when generating db migrations.
You would also need to modify some of your Python files. I will sketch the changes here directly, and you can adapt them:
# Imports at Resources
from flask import request
from app.models import Game, eGameLevel
Post Part:
# For post part
payload = request.json
game_obj = Game(game_level=eGameLevel[payload["game_level"]], user_id=payload["user_id"])
db.session.add(game_obj)
db.session.commit()
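For the read side (my own addition, not part of the original answer), the stored integer can be translated back to its label with a reverse lookup of the same dictionary:
# Reverse lookup: {1: 'BEGINNER', 2: 'ADVANCED'}
level_labels = {value: name for name, value in eGameLevel.items()}

def game_to_json(game):
    return {'game_level': level_labels[game.game_level],
            'user_id': game.user_id}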
Furthermore, I did not understand what the from app.models import Gode meant.

Instantiating object automatically adds to SQLAlchemy Session. Why?

From my understanding of SQLAlchemy, in order to add a model to a session, I need to call session.add(obj). However, for some reason, in my code, SQLAlchemy seems to do this automatically.
Why is it doing this, and how can I stop it? Am I approaching sessions in the correct way?
example
>>> from database import Session as db
>>> import clients
>>> from instances import Instance
>>> from uuid import uuid4
>>> len(db.query(Instance).all())
0    # Note, no instances in database/session
>>> i = Instance(str(uuid4()), clients.get_by_code('AAA001'), [str(uuid4())])
>>> len(db.query(Instance).all())
1    # Why?? I never called db.add(i)!
database.py
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base

import config

Base = declarative_base()

class Database():
    def __init__(self):
        db_url = 'postgresql://{:s}:{:s}@{:s}:{}/{:s}'.format(
            config.database['user'],
            config.database['password'],
            config.database['host'],
            config.database['port'],
            config.database['dbname']
        )
        self.engine = create_engine(db_url)
        session_factory = sessionmaker(bind=self.engine)
        self.session = scoped_session(session_factory)

Database = Database()
Session = Database.session
instance.py
from sqlalchemy import Column, Text, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import UUID, ARRAY

import database

Base = database.Base

class Instance(Base):
    __tablename__ = 'instances'

    uuid = Column(UUID, primary_key=True)
    client_code = Column(
        Text, ForeignKey('clients.code', ondelete='CASCADE'), nullable=False)
    mac_addresses = Column(ARRAY(Text, as_tuple=True),
                           primary_key=True)
    client = relationship("Client", back_populates="instances")

    def __init__(self, uuid, client, mac_addresses):
        self.uuid = uuid
        self.client = client
        self.mac_addresses = tuple(mac_addresses)
client.py
from sqlalchemy import Column, Text
from sqlalchemy.orm import relationship

import database
from database import Session as db

Base = database.Base

class Client(Base):
    __tablename__ = 'clients'

    code = Column(Text, primary_key=True)
    name = Column(Text)

    instances = relationship("Instance", back_populates='client')

    def __init__(self, code, name=None):
        self.code = code
        self.name = name

def get_by_code(code):
    client = db.query(Client).filter(Client.code == code).first()
    return client
When you create a SQLAlchemy object and link it directly to another SQLAlchemy object, both objects end up in the session.
The reason is that SQLAlchemy needs to make sure you can query these objects.
Take, for example, a user with addresses.
If you create a user in code, with an address, both the user and the address end up in the session, because the address is linked to the user and SQLAlchemy needs to make sure you can query all addresses of a user using user.addresses.all().
In that case all (possibly) existing addresses need to be fetched, as well as the new address you just added. For that purpose the newly added address needs to be saved in the database.
To prevent this from happening (for example if you only need the objects for a calculation), you can link the objects by their IDs/foreign keys instead:
address.user_id = user.user_id
However, if you do this, you won't be able to access the SQLAlchemy properties anymore. So user.addresses or address.user will no longer yield results.
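Applied to the question's models, the difference looks roughly like this (a sketch only; note that this cascade-on-backref behaviour is the pre-2.0 SQLAlchemy default and is deprecated as of 1.4):
from uuid import uuid4
from database import Session as db
from instances import Instance
import clients

client = clients.get_by_code('AAA001')   # loaded through db, so attached to it

i = Instance(str(uuid4()), client, [str(uuid4())])
# Assigning the relationship (self.client = client in __init__) cascades the
# new Instance into the session that already holds client:
print(i in db.new)                        # True -- pending, without db.add(i)

# To keep it out of the session, detach it again with db.expunge(i), or build
# it with the foreign key only (client_code = client.code) instead of assigning
# i.client, accepting that i.client then stays unpopulated.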
The reverse is also true; I asked a question myself a while back about why linking two objects by ID does not result in SQLAlchemy linking those objects in the ORM:
relevant stackoverflow question
another description of this behavior

Scrapy and DuplicatesPipeline avoid saving duplicates to db

Currently my Scrapy-based spider scrapes a URL (which updates every minute with new items) and saves the news list items to a database. The list is updated every hour, and I am trying to avoid adding duplicates of these news items through the use of class DuplicatesPipeline(object): in my pipelines.py.
Currently my script is saving news items into the db, but it still saves duplicates.
Probably class DuplicatesPipeline is the wrong way to go, since it does not seem to check against existing records in the database; it only checks against duplicates in the current session.
Very thankful for your help
Model:
class Listitem(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "newsitems"

    id = Column(Integer, primary_key=True)
    title = Column('title', String)
    description = Column('description', String, nullable=True)
    link = Column('link', String, nullable=True)
    date = Column('date', String, nullable=True)
Pipelines.py:
from sqlalchemy.orm import sessionmaker
from models import Presstv, db_connect, create_presstv_table
from scrapy import signals
from scrapy.exceptions import DropItem

class PressTvPipeline(object):
    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_presstv_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save deals in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        deal = Presstv(**item)

        try:
            session.add(deal)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

        return item
I think you should:
add a UNIQUE constraint on link in your database
check something like db.query(Listitem).filter_by(link=item['link']).first() is None in the pipeline
Your mechanism may be used as an optimization: if you find a copy in your instance cache, there is no need to run the query. But if there is no copy, run the query.
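Put together, the pipeline could look roughly like this (a sketch based on the code above; it keeps the question's naming, where the model class is Listitem, and assumes the UNIQUE constraint has been added to link):
from sqlalchemy.orm import sessionmaker
from models import Listitem, db_connect, create_presstv_table

class PressTvPipeline(object):
    def __init__(self):
        engine = db_connect()
        create_presstv_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        try:
            # Skip items whose link is already stored; the UNIQUE constraint
            # on link is the safety net if two runs race each other.
            if session.query(Listitem).filter_by(link=item['link']).first() is None:
                session.add(Listitem(**item))
                session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item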

Loading heroku pg database with scraped data from scrapy spider

I am new to heroku pg. I have written a scrapy crawler which runs without any errors. The problem is that I want to put all the scraped data into my heroku postgres database. To do that, I somewhat followed this tutorial.
When I run the crawler on my local machine using scrapy crawl spidername, it runs successfully, but the scraped data is not inserted and no table is created in the heroku database. I don't even get any errors in the local terminal. This is what my code is...
settings.py
BOT_NAME = 'crawlerconnectdatabase'

SPIDER_MODULES = ['crawlerconnectdatabase.spiders']
NEWSPIDER_MODULE = 'crawlerconnectdatabase.spiders'

DATABASE = {'drivername': 'postgres',
            'host': 'ec2-54-235-250-41.compute-1.amazonaws.com',
            'port': '5432',
            'username': 'dtxwjcycsaweyu',
            'password': '***',
            'database': 'ddcir2p1u2vk07'}
items.py
from scrapy.item import Item, Field

class CrawlerconnectdatabaseItem(Item):
    name = Field()
    url = Field()
    title = Field()
    link = Field()
    page_title = Field()
    desc_link = Field()
    body = Field()
    news_headline = Field()
models.py
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import settings

DeclarativeBase = declarative_base()

def db_connect():
    return create_engine(URL(**settings.DATABASE))

def create_deals_table(engine):
    DeclarativeBase.metadata.create_all(engine)

class Deals(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "news_data"

    id = Column(Integer, primary_key=True)
    body = Column('body', String)
pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Deals, db_connect, create_deals_table

class CrawlerconnectdatabasePipeline(object):
    def __init__(self):
        engine = db_connect()
        create_deals_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        deal = Deals(**item)

        try:
            session.add(deal)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

        return item
spider
You will find the code for the scrapy spider here.
You need to add ITEM_PIPELINES = {'crawlerconnectdatabase.pipelines.CrawlerconnectdatabasePipeline': 300,} to your settings.py, as shown below.
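Formatted, that is the only change needed in settings.py (300 is just the pipeline's order; any value from 0 to 1000 works):
# settings.py of the crawlerconnectdatabase project
ITEM_PIPELINES = {
    'crawlerconnectdatabase.pipelines.CrawlerconnectdatabasePipeline': 300,
}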
