Loading heroku pg database with scraped data from scrapy spider - python

I am new to heroku pg. What I am doing here is: I have written a scrapy crawler which runs without any errors. The problem is that I want to put all the scraped data into my heroku postgres database. To do that, I loosely followed this tutorial.
When I run the crawler on my local machine using scrapy crawl spidername, it runs successfully, but the scraped data is not inserted, nor is any table created in the heroku database. I don't even get any errors on the local terminal. This is what my code is...
settings.py
BOT_NAME = 'crawlerconnectdatabase'
SPIDER_MODULES = ['crawlerconnectdatabase.spiders']
NEWSPIDER_MODULE = 'crawlerconnectdatabase.spiders'
DATABASE = {'drivername': 'postgres',
            'host': 'ec2-54-235-250-41.compute-1.amazonaws.com',
            'port': '5432',
            'username': 'dtxwjcycsaweyu',
            'password': '***',
            'database': 'ddcir2p1u2vk07'}
items.py
from scrapy.item import Item, Field
class CrawlerconnectdatabaseItem(Item):
    name = Field()
    url = Field()
    title = Field()
    link = Field()
    page_title = Field()
    desc_link = Field()
    body = Field()
    news_headline = Field()
models.py
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL
import settings
DeclarativeBase = declarative_base()
def db_connect():
    return create_engine(URL(**settings.DATABASE))


def create_deals_table(engine):
    DeclarativeBase.metadata.create_all(engine)


class Deals(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "news_data"

    id = Column(Integer, primary_key=True)
    body = Column('body', String)
pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Deals, db_connect, create_deals_table
class CrawlerconnectdatabasePipeline(object):
    def __init__(self):
        engine = db_connect()
        create_deals_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        deal = Deals(**item)
        try:
            session.add(deal)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item
spider
The code for the scrapy spider can be found here.

You need to add ITEM_PIPELINES to your settings.py, otherwise Scrapy never passes the scraped items through your pipeline.
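In this project it would look like this (the path is taken from your pipelines.py):

# settings.py
ITEM_PIPELINES = {
    'crawlerconnectdatabase.pipelines.CrawlerconnectdatabasePipeline': 300,
}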

Related

Error Connecting to SQL Server database using Sqlalchemy module in Python

I am stuck connecting to my database. I'm not sure why the following error keeps coming up even though I've followed the documentation closely.
sqlalchemy.exc.InterfaceError: (pyodbc.InterfaceError) ('IM002', '[IM002] [Microsoft][ODBC Driver Manager] Data source name not found and no default driver specified (0) (SQLDriverConnect)')
My connection string looks like this
from flask import Flask, request, jsonify
from flask_sqlalchemy import SQLAlchemy
from flask_marshmallow import Marshmallow
from sqlalchemy import desc, create_engine

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'mssql+pyodbc://localhost\SQLEXPRESS/master?driver=ODBC Driver 17 for SQL Server?Trusted_Connection=True'
app.config['SQLALCEHMY_MODIFICATIONS'] = False

db = SQLAlchemy()
ma = Marshmallow()


class Post(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(100))
    description = db.Column(db.String(200))
    author = db.Column(db.String(50))

    def __init__(self, title, description, author):
        self.title = title
        self.description = description
        self.author = author


class PostSchema(ma.Schema):
    class Meta:
        fields = ("title", "description", "author")


post_schema = PostSchema()
posts_schema = PostSchema(many=True)


@app.route('/', methods=['GET'])
def get_post():
    return jsonify({"Hello": "World"})


@app.route('/post', methods=['POST'])
def add_post():
    title = request.json['title']
    description = request.json['description']
    author = request.json['author']
    my_post = Post(title, description, author)
    db.session.add(my_post)
    db.session.commit()
    return post_schema.jsonify(my_post)


db.init_app(app)
ma.init_app(app)

if __name__ == "__main__":
    app.run(debug=True)
Is there anything I left out? Whenever I try to create a POST request through Postman, it shows the error stated above. I am sure the ODBC driver exists, and I've connected it to the local database installed on my PC. Also, the table Post is already created in my database, as follows:
USE [master]
GO
/****** Object: Table [dbo].[Post] Script Date: 23-Mar-22 1:30:52 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[Post](
    [id] [int] IDENTITY(1,1) NOT NULL,
    [title] [nvarchar](200) NULL,
    [description] [nvarchar](200) NULL,
    [author] [nvarchar](200) NULL,
    CONSTRAINT [PK_ID] PRIMARY KEY CLUSTERED
    (
        [id] ASC
    ) WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
) ON [PRIMARY]
GO
Thanks for any suggestions
As reviewed, the connection URI was missing an escape for the backslash and needed '&' instead of the second '?'. Thanks for the suggestions.
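For reference, applying those two changes, the URI would look roughly like this (a sketch based on the question's server and driver; spaces in the driver name are typically URL-encoded as '+', and ODBC expects Trusted_Connection=yes):

# Hypothetical corrected URI: backslash escaped, '&' between query parameters
app.config['SQLALCHEMY_DATABASE_URI'] = (
    'mssql+pyodbc://localhost\\SQLEXPRESS/master'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)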

Python/Flask/SQLAlchemy data in instance list persisting after page reloads

Project structure
app/
-- entrypoint.py
-- database.py
-- models/
---- person.py
---- filtering_class.py
-- pages/
---- routes.py
Files involved in the issue:
entrypoint.py
from flask import Flask
import models
app = Flask(__name__, instance_relative_config=True)
database.py
from flask import _app_ctx_stack
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, scoped_session
SQLALCHEMY_DATABASE_URI = 'sqlite:///persons.db'
SQLALCHEMY_TRACK_MODIFICATIONS = False
engine = create_engine(
    SQLALCHEMY_DATABASE_URI,
    connect_args={"check_same_thread": False}
)
SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine
)
session = scoped_session(
    SessionLocal,
    scopefunc=_app_ctx_stack.__ident_func__
)
Base = declarative_base()
routes.py
from flask import render_template, request
from entrypoint import app
from database import session
from models import Person
@app.route('/person')
def person():
    response_id = request.args.get('id')
    person = session.query(Person).filter_by(id=response_id).first()
    person = Person.get_others(person=person)
    return render_template('person.html', person=person)
person.py
from sqlalchemy import Column, Integer, String

from database import Base, session
from models import FilteringClass


class Person(Base):
    __tablename__ = 'persons'

    id = Column(Integer, primary_key=True)
    first_name = Column(String(80), nullable=True)
    last_name = Column(String(80), nullable=True)

    @staticmethod
    def get_persons(filtering_class=None):
        query = session.query(Person.id,
                              Person.first_name,
                              Person.last_name)
        if filtering_class:
            query = filtering_class.apply_filters(query)
        results = query.all()
        return results

    @staticmethod
    def get_others(person):
        # Here lies the problem
        filtering_class = FilteringClass()
        # After the first call, len(filtering_class.custom_expressions) == 0 (expected);
        # after the second call, len(...) is already 1, and so on.
        filtering_class.add_custom_expression(Person.id != person.id)
        return Person.get_persons(filtering_class=filtering_class)
filtering_class.py
class FilteringClass(object):
    def __init__(self,
                 custom_expressions=[]):
        self.custom_expressions = custom_expressions

    def apply_filters(self, query):
        # Import Person here to avoid circular import issues in person.py
        from models.person import Person

        if self.custom_expressions:
            for exp in self.custom_expressions:
                query = query.filter(exp)
        return query

    def add_custom_expression(self, expression):
        self.custom_expressions.append(expression)
Description
FilteringClass is used to filter a passed query argument. It has a method for users of the class to add their own BinaryExpressions to be applied when FilteringClass.apply_filters() is called.
The goal here is to retrieve all Persons that are not the same as the person who initiated the page request, by using the FilteringClass to exclude Person objects with the same ID.
The problem
The expected behavior would be that on each request, a new FilteringClass is instantiated (see Person.get_others --> filtering_class = FilteringClass()).
At that point it is expected that the internal custom_expressions array inside filtering_class instance would be empty, as defined in its constructor.
However, every time the page related to the /person route is reloaded and the instance of filtering_class is created, its custom_expressions array is already populated with the previously added custom expression. This means that on every page reload, filtering_class.custom_expressions grows without ever coming back to an empty state.
What I tried
Resetting custom_expressions to empty with self.custom_expressions = [] directly after filtering the passed query.
Calling session.close() right before returning the page template in the /person endpoint.
Calling session.commit() right before returning the page template in the /person endpoint (but I don't think I should have to commit anything for a SELECT statement anyway).
Sorry for the long post, I tried to include everything that could be useful (but please let me know if I should add anything).
UPDATE: Solution:
As per @larsks' comment, the issue was not related to SQLAlchemy but to a Python gotcha with mutable default arguments (see: https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments).
All there was to do to fix the issue was:
Changing FilteringClass' constructor from:
def __init__(self,
             custom_expressions=[]):
    self.custom_expressions = custom_expressions
to:
def __init__(self,
             custom_expressions=None):
    self.custom_expressions = custom_expressions
Change the add_custom_expression method from:
def add_custom_expression(self, expression):
    self.custom_expressions.append(expression)
to:
def add_custom_expression(self, expression):
    if self.custom_expressions is None:
        self.custom_expressions = []
    self.custom_expressions.append(expression)
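For illustration, a minimal sketch of the gotcha itself (a hypothetical function, independent of the project code): the default list is created once, at function definition time, and is then shared across all calls.

def add_item(item, items=[]):  # the default [] is evaluated only once
    items.append(item)
    return items

print(add_item(1))  # [1]
print(add_item(2))  # [1, 2] -- the same list object is reused across calls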

Flask-Restx not converting enum field type to JSON

I need help with the Enum field type, as it is not accepted by Swagger and I am getting the error message TypeError: Object or Type eGameLevel is not JSON serializable. Below is the complete set of code for the table, including the DB table and sqlalchemy settings. I already tried the Marshmallow-Enum Flask package and it didn't work. I'm looking for help with some explanation about the solution so I can learn it well. :-)
I am using MySQL with Flask. In Postgres it's pretty easy to manage all the choice fields. All I need is a working example or a link to a repository where MySQL choice fields show up in the Swagger drop-down.
My Model:
import enum
from app import db
from typing import List


class eGameLevel(enum.Enum):
    BEGINNER = 'Beginner'
    ADVANCED = 'Advanced'


class Game(Base):
    __tablename__ = 'game_stage'

    id = db.Column(db.Integer(), primary_key=True)
    game_level = db.Column(db.Enum(eGameLevel),
                           default=eGameLevel.BEGINNER, nullable=False)
    user_id = db.Column(db.Integer(), db.ForeignKey('users.id', ondelete='CASCADE'), nullable=False)
    user = db.relationship('User', backref='game__level_submissions', lazy=True)

    def __init__(self, game_level, user_id):
        self.game_level = game_level
        self.user_id = user_id

    def __repr__(self):
        return 'Game(game_level%s, ' \
               'user_id%s' % (self.game_level,
                              self.user_id)

    def json(self):
        return {'game_level': self.game_level,
                'user_id': self.user_id}

    @classmethod
    def by_game_id(cls, _id):
        return cls.query.filter_by(id=_id)

    @classmethod
    def find_by_game_level(cls, game_level):
        return cls.query.filter_by(game_level=game_level)

    @classmethod
    def by_user_id(cls, _user_id):
        return cls.query.filter_by(user_id=_user_id)

    @classmethod
    def find_all(cls) -> List["Game"]:
        return cls.query.all()

    def save_to_db(self) -> None:
        db.session.add(self)
        db.session.commit()

    def delete_from_db(self) -> None:
        db.session.delete(self)
        db.session.commit()
My Schema
from app import ma
from app.models import Gode


class GameSchema(ma.SQLAlchemyAutoSchema):
    game = ma.Nested('GameSchema', many=True)

    class Meta:
        model = Game
        load_instance = True
        include_fk = True
My Resources:
from flask_restx import Resource, fields, Namespace
from app.models import Game
from app import db
from app.schemas import GameSchema

GAME_REQUEST_NOT_FOUND = "Game request not found."
GAME_REQUEST_ALREADY_EXSISTS = "Game request '{}' Already exists."

game_ns = Namespace('Game', description='Available Game Requests')
games_ns = Namespace('Game Requests', description='All Games Requests')

game_schema = GameSchema()
games_list_schema = GameSchema(many=True)

gamerequest = game_ns.model('Game', {
    'game_level': fields.String('Game Level: Must be one of: BEGINNER, ADVANCED.'),
    'user_id': fields.Integer,
})


class GameRequestsListAPI(Resource):
    @games_ns.doc('Get all Game requests.')
    def get(self):
        return games_list_schema.dump(Game.find_all()), 200

    @games_ns.expect(gamerequest)
    @games_ns.doc("Create a Game request.")
    def post(self):
        game_json = request.get_json()
        game_data = game_schema.load(game_json)
        game_data.save_to_db()
        return game_schema.dump(game_data), 201
Instead of trying to manage Enum fields in the MySQL schema, I suggest using another table with a backref for your eGameLevel. You can get rid of this whole fuss, and if in the future you need to add another option to your choice field, you won't have to hardcode it.
Simply create a main table Game and a sub table eGameLevel (with only one string field). You will be able to access the choices from your Game table.
Whenever I get stuck I go back to the basics, as mentioned here.
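A rough sketch of that layout (hypothetical table and column names, reusing the Flask-SQLAlchemy db object from the question):

class GameLevel(db.Model):
    # Lookup table holding the allowed levels, e.g. 'Beginner', 'Advanced'
    __tablename__ = 'game_level'
    id = db.Column(db.Integer(), primary_key=True)
    name = db.Column(db.String(50), unique=True, nullable=False)


class Game(db.Model):
    __tablename__ = 'game_stage'
    id = db.Column(db.Integer(), primary_key=True)
    # Reference the lookup row instead of a hardcoded Enum
    game_level_id = db.Column(db.Integer(), db.ForeignKey('game_level.id'), nullable=False)
    game_level = db.relationship('GameLevel', backref='games', lazy=True)

Adding a new level is then just inserting a row into game_level rather than changing the schema.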
I made a small example just to test the serialization of an Enum
from enum import Enum

import sqlalchemy as sa
from flask import Flask
from flask_restx import Api, Namespace, Resource
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker


class eGameLevel(str, Enum):
    BEGINNER = "Beginner"
    ADVANCED = "Advanced"


engine = sa.create_engine("sqlite:///:memory:")
session = scoped_session(sessionmaker(bind=engine))
Base = declarative_base()


class Game(Base):
    __tablename__ = "game"

    id = sa.Column(sa.Integer, primary_key=True, autoincrement=True)
    level = sa.Column(sa.Enum(eGameLevel), default=eGameLevel.BEGINNER, nullable=False)

    def __repr__(self):
        return f"Game(id={self.id}, level={self.level})"

    def json(self):
        data = {"id": self.id, "level": self.level}
        return data


Base.metadata.create_all(engine)

g1 = Game(level=eGameLevel.BEGINNER)
g2 = Game(level=eGameLevel.ADVANCED)
session.add_all([g1, g2])
session.commit()

query_content = session.query(Game).all()

games_ns = Namespace("Game Requests", description="All Games Requests")

app = Flask(__name__)
api = Api(app)


@api.route("/game")
class GameRequestsListAPI(Resource):
    @games_ns.doc("Get all Game requests.")
    def get(self):
        data = [x.json() for x in query_content]
        return data, 200


app.run(debug=True)
This example works and I think the serialization is possible due to the str in the Enum declaration: class eGameLevel(str, Enum).
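A minimal way to check that point in isolation (a sketch, independent of Flask and the database):

import json
from enum import Enum


class eGameLevel(str, Enum):
    BEGINNER = "Beginner"
    ADVANCED = "Advanced"


# Because each member is also a str, the stdlib JSON encoder handles it directly.
print(json.dumps({"level": eGameLevel.BEGINNER}))  # {"level": "Beginner"}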
Instead of using Enum:
class eGameLevel(enum.Enum):
    BEGINNER = 'Beginner'
    ADVANCED = 'Advanced'
You can make use of dictionary:
eGameLevel = {"BEGINNER": 1, "ADVANCED": 2}
Then you can change the column in the SQL data model from the enum type to an Integer type:
game_level = db.Column(db.Integer(),
                       default=eGameLevel["BEGINNER"], nullable=False)
And make appropriate checks using the defined dictionary throughout the application. This will also solve issues with Alembic when making db migrations.
You would also need to modify some of your Python files. I'll show the changes here directly, and you can adapt them:
# Imports in the resources module
from flask import request
from app.models import Game, eGameLevel
POST part:
# For the POST part
payload = request.json
game_obj = Game(game_level=eGameLevel[payload["game_level"]], user_id=payload["user_id"])
db.session.add(game_obj)
db.session.commit()
Furthermore, I did not understand what the from app.models import Gode meant.

Saving spider results to database

Currently thinking about a good way to save my scraped data into a database.
App flow:
Run spider (data scraper), file located in spiders/
When the data has been collected successfully, save the data/items (title, link, pubDate) to the database using the class in pipeline.py
I would like your help on how to save the scraped data (title, link, pubDate) from spider.py into the database through pipeline.py; currently I have nothing connecting these files together. When the data has been successfully scraped, the pipeline needs to be notified so it can receive the data and save it.
I'm very thankful for your help
Spider.py
import urllib.request
import lxml.etree as ET
opener = urllib.request.build_opener()
tree = ET.parse(opener.open('https://nordfront.se/feed'))
items = [{'title': item.find('title').text,
          'link': item.find('link').text,
          'pubdate': item.find('pubDate').text}
         for item in tree.xpath("/rss/channel/item")]

for item in items:
    print(item['title'], item['link'], item['pubdate'])
Models.py
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL
from sqlalchemy import UniqueConstraint
import datetime
import settings
def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    return create_engine(URL(**settings.DATABASE))


DeclarativeBase = declarative_base()

# <--snip-->


def create_presstv_table(engine):
    DeclarativeBase.metadata.create_all(engine)


def create_nordfront_table(engine):
    DeclarativeBase.metadata.create_all(engine)


def _get_date():
    return datetime.datetime.now()


class Nordfront(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "nordfront"

    id = Column(Integer, primary_key=True)
    title = Column('title', String)
    description = Column('description', String, nullable=True)
    link = Column('link', String, unique=True)
    date = Column('date', String, nullable=True)
    created_at = Column('created_at', DateTime, default=_get_date)
Pipeline.py
from sqlalchemy.orm import sessionmaker
from models import Nordfront, db_connect, create_nordfront_table
class NordfrontPipeline(object):
    """Pipeline for storing scraped items in the database"""

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_nordfront_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save data in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        deal = Nordfront(**item)
        if session.query(Nordfront).filter_by(link=item['link']).first() is None:
            try:
                session.add(deal)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        return item
Settings.py
DATABASE = {'drivername': 'postgres',
            'host': 'localhost',
            'port': '5432',
            'username': 'toothfairy',
            'password': 'password123',
            'database': 'news'}
As far as I understand, this is a Scrapy-specific question. If so, you just need to activate your pipeline in settings.py:
ITEM_PIPELINES = {
    'myproj.pipeline.NordfrontPipeline': 100
}
This lets the engine know to send the crawled items to the pipeline (see the data flow overview in the Scrapy architecture docs).
If we are not talking about Scrapy, then call process_item() directly from your spider:
from pipeline import NordfrontPipeline

...

pipeline = NordfrontPipeline()
for item in items:
    pipeline.process_item(item, None)
You may also remove the spider argument from the process_item() pipeline method since it is not used.

Scrapy and DuplicatesPipeline avoid saving duplicates to db

Currently my Scrapy-based spider is scraping a URL (this URL updates every minute with new items) and saving news list items to a database. The list is updated every hour, and I am trying to avoid adding duplicates of these news items through the use of "class DuplicatesPipeline(object):" in my pipelines.py.
Currently my script is saving news items into the db; however, it still saves duplicates.
DuplicatesPipeline is probably the wrong way to go, since it does not seem to check against existing records in the database; it only checks against duplicates in the current session.
Very thankful for your help
Model:
class Listitem(DeclarativeBase):
    """Sqlalchemy deals model"""
    __tablename__ = "newsitems"

    id = Column(Integer, primary_key=True)
    title = Column('title', String)
    description = Column('description', String, nullable=True)
    link = Column('link', String, nullable=True)
    date = Column('date', String, nullable=True)
Pipelines.py:
from sqlalchemy.orm import sessionmaker
from models import Presstv, db_connect, create_presstv_table
from scrapy import signals
from scrapy.exceptions import DropItem
class PressTvPipeline(object):
    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_presstv_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save deals in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        deal = Presstv(**item)
        try:
            session.add(deal)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item
I think you should:
add a UNIQUE constraint on link in your database
check something like db.query(Listitem).filter_by(link=item.link).first() is None in the pipeline
Your mechanism may be used as an optimization: if you find a copy in your instance cache, there is no need to run the query. But if there is no copy, run the query.
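A sketch of how process_item could apply both suggestions, assuming the Listitem model and db_connect from the question (the model name used in your pipeline may differ):

# models.py: enforce uniqueness at the database level as well
link = Column('link', String, unique=True)

# pipelines.py
def process_item(self, item, spider):
    session = self.Session()
    try:
        # Skip the insert if a row with this link already exists.
        if session.query(Listitem).filter_by(link=item['link']).first() is None:
            session.add(Listitem(**item))
            session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
    return item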
