This is what the project tree looks like:
rym_chart_scraper
├───scrapy.cfg
├───rym_chart_scraper
│ ├───__init__.py
│ ├───items.py
│ ├───models.py
├───pipelines.py
├───settings.py
├───spiders
├───my_spider.py
├───__init__.py
pipelines.py
from models import TopAlbums, db_connect, create_topalbums_table
from sqlalchemy.orm import sessionmaker
class TopAlbumPipeline:
def __init__(self):
engine = db_connect()
create_topalbums_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
session = self.Session()
topalbums = TopAlbums(**item)
try:
session.add(topalbums)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
models.py
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
import settings
Base = declarative_base()
def db_connect():
return create_engine(URL(**settings.DATABASE))
def create_topalbums_table(engine):
Base.metadata.create_all(engine)
class TopAlbums(Base):
__tablename__ = 'top_albums'
id = Column(Integer, primary_key=True)
Artist = Column('Artist', String)
Album = Column('Album', String)
Chart_year = Column('Chart_year', String)
Genre = Column('Genre', String)
Ratings = Column('Ratings', Integer)
Reviews = Column('Reviews', Integer)
Date = Column('Date', DateTime)
And the spider:
from scrapy import Spider, Request
from rym_chart_scraper.utility import find_between, listToString
from rym_chart_scraper.items import TopAlbumChartItem
from datetime import datetime
class TopAlbumChartSpider(Spider):
name = "top_music_charts"
allowed_domains = ['rateyourmusic.com']
start_urls = [
"https://rateyourmusic.com/charts/top/album/all-time"
]
n_pages = 1
def parse(self, response):
for album, stats in zip(response.css('div.chart_main'),
response.css('div.chart_stats')):
...
yield item
next_page = response.css('a.navlinknext')[0].css(
'a::attr(href)').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
self.n_pages += 1
if self.n_pages < 31:
yield Request(next_page, callback=self.parse)
When I run the scraper with:
scrapy crawl top_music_charts
I get the following import error.
2016-12-11 17:46:41 [twisted] CRITICAL:
Traceback (most recent call last):
File "/Users/baasman/anaconda/lib/python3.5/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
result = g.send(result)
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/crawler.py", line 72, in crawl
self.engine = self._create_engine()
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/crawler.py", line 97, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/core/engine.py", line 69, in __init__
self.scraper = Scraper(crawler)
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/core/scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/middleware.py", line 34, in from_settings
mwcls = load_object(clspath)
File "/Users/baasman/anaconda/lib/python3.5/site-packages/scrapy/utils/misc.py", line 44, in load_object
mod = import_module(module)
File "/Users/baasman/anaconda/lib/python3.5/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 986, in _gcd_import
File "<frozen importlib._bootstrap>", line 969, in _find_and_load
File "<frozen importlib._bootstrap>", line 958, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 673, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 665, in exec_module
File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
File "/Users/baasman/Documents/python-workspace/rym_chart_scraper/rym_chart_scraper/pipelines.py", line 1, in <module>
from models import TopAlbums, db_connect, create_topalbums_table
ImportError: No module named 'models'
Trying to import 'models' interactively from main doesn't give an error, just when running the actual spider from the command line. Is there something wrong with the structure of the project? Or is it some other silly mistake? For some reason I can not get past this.
Related
I am getting the following error using Djongo with Mongodb in a django server:
Watching for file changes with StatReloader
Performing system checks...
Exception in thread django-main-thread:
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 946, in run
self._target(*self._args, **self._kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/utils/autoreload.py", line 53, in wrapper
fn(*args, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/core/management/commands/runserver.py", line 118, in inner_run
self.check(display_num_errors=True)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/core/management/base.py", line 392, in check
all_issues = checks.run_checks(
File "/home/cchilders/.local/lib/python3.10/site-packages/django/core/checks/registry.py", line 70, in run_checks
new_errors = check(app_configs=app_configs, databases=databases)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/core/checks/urls.py", line 13, in check_url_config
return check_resolver(resolver)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/core/checks/urls.py", line 23, in check_resolver
return check_method()
File "/home/cchilders/.local/lib/python3.10/site-packages/django/urls/resolvers.py", line 408, in check
for pattern in self.url_patterns:
File "/home/cchilders/.local/lib/python3.10/site-packages/django/utils/functional.py", line 48, in __get__
res = instance.__dict__[self.name] = self.func(instance)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/urls/resolvers.py", line 589, in url_patterns
patterns = getattr(self.urlconf_module, "urlpatterns", self.urlconf_module)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/utils/functional.py", line 48, in __get__
res = instance.__dict__[self.name] = self.func(instance)
File "/home/cchilders/.local/lib/python3.10/site-packages/django/urls/resolvers.py", line 582, in urlconf_module
return import_module(self.urlconf_name)
File "/usr/lib/python3.10/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/home/cchilders/projects/stocks_backend/dividends_project/urls.py", line 23, in <module>
path('users/', include('users.urls', namespace='users')),
File "/home/cchilders/.local/lib/python3.10/site-packages/django/urls/conf.py", line 34, in include
urlconf_module = import_module(urlconf_module)
File "/usr/lib/python3.10/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/home/cchilders/projects/stocks_backend/users/urls.py", line 3, in <module>
from . import views
File "/home/cchilders/projects/stocks_backend/users/views.py", line 10, in <module>
from djongo import transaction
File "/home/cchilders/.local/lib/python3.10/site-packages/djongo/transaction.py", line 2, in <module>
from djongo import djongo_access_url
ImportError: cannot import name 'djongo_access_url' from 'djongo' (/home/cchilders/.local/lib/python3.10/site-packages/djongo/__init__.py)
My users/views.py:
from django.shortcuts import render
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
import json
from helpers.view_functions import parse_request_body
from .models import UserProfile
from djongo import transaction
#csrf_exempt
def get_user_profile(request, user_id):
# pass
if request.method == 'GET':
with transaction.atomic():
profile, created = UserProfile.objects.get_or_create(user_id=user_id)
if created:
profile.user_id = user_id
profile.searches = [
{'search_term': 'hd'},
{'search_term': 'wba'},
]
profile.display_settings = [
{'setting_name': 'showYieldChange', 'visible': True},
{'setting_name': 'showAllDividends', 'visible': True},
]
profile.save()
print("user saved in db")
user = UserProfile.objects.get(user_id=user_id)
data = {
'user_id': user.user_id,
'searches': user.searches,
'display_settings': user.display_settings
}
json_data = json.dumps(data)
return HttpResponse({json_data}, content_type='application/json')
if request.method == 'POST':
body = parse_request_body(request)
searches = body['searches']
searches_objects = [{'search_term': x} for x in searches]
print("New searches for user {user_id}".format(user_id=user_id))
print(searches_objects)
user = UserProfile.objects.get(user_id=user_id)
user.searches = searches_objects
user.display_settings = body['display_settings']
user.save()
return HttpResponse("it worked")
users/urls.py:
from django.urls import path
from . import views
app_name = 'dividends'
urlpatterns = [
path('<str:user_id>', views.get_user_profile, name='get_user_profile'),
]
requirements.txt:
bs4
django==3.1.12
django-cors-headers
djongo
gunicorn
html5lib
pymongo==3.12.3
python-decouple
yfinance
users/models.py:
from djongo import models
class RecentSearch(models.Model):
search_term = models.CharField(max_length=100)
class Meta:
abstract = True
class DisplaySetting(models.Model):
setting_name = models.CharField(max_length=150)
visible = models.BooleanField()
class Meta:
abstract = True
class UserProfile(models.Model):
user_id = models.CharField(max_length=255, unique=True)
searches = models.ArrayField(model_container=RecentSearch, null=True)
display_settings = models.ArrayField(model_container=DisplaySetting, null=True)
objects = models.DjongoManager()
in ipython:
IPython 8.4.0 -- An enhanced Interactive Python. Type '?' for help.
In [1]: import djongo
In [2]: from djongo import transaction
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Input In [2], in <cell line: 1>()
----> 1 from djongo import transaction
File ~/.local/lib/python3.10/site-packages/djongo/transaction.py:2, in <module>
1 from djongo.exceptions import NotSupportedError
----> 2 from djongo import djongo_access_url
4 print(f'This version of djongo does not support transactions. Visit {djongo_access_url}')
5 raise NotSupportedError('transactions')
ImportError: cannot import name 'djongo_access_url' from 'djongo' (/home/cchilders/.local/lib/python3.10/site-packages/djongo/__init__.py)
In [3]:
correctly using get_or_create and only saving new data if created=True solves the bug of duplicate models when there should only be one unique one
#csrf_exempt
def get_user_profile(request, user_id):
pass
if request.method == 'GET':
# with transaction.atomic():
profile, created = UserProfile.objects.get_or_create(user_id=user_id)
if created:
profile.user_id = user_id
profile.searches = [
{'search_term': 'hd'},
{'search_term': 'wba'},
]
profile.display_settings = [
{'setting_name': 'showYieldChange', 'visible': True},
{'setting_name': 'showAllDividends', 'visible': True},
]
profile.save()
print("user saved in db")
# user = UserProfile.objects.get(user_id=user_id)
data = {
'user_id': profile.user_id,
'searches': profile.searches,
'display_settings': profile.display_settings
}
json_data = json.dumps(data)
return HttpResponse({json_data}, content_type='application/json')
you must set your unique attribute like user_id or stock ticker to unique=True
#csrf_exempt
def get_user_profile(request, user_id):
pass
if request.method == 'GET':
# with transaction.atomic():
profile, created = UserProfile.objects.get_or_create(user_id=user_id)
if created:
profile.user_id = user_id
profile.searches = [
{'search_term': 'hd'},
{'search_term': 'wba'},
]
profile.display_settings = [
{'setting_name': 'showYieldChange', 'visible': True},
{'setting_name': 'showAllDividends', 'visible': True},
]
profile.save()
print("user saved in db")
# user = UserProfile.objects.get(user_id=user_id)
data = {
'user_id': profile.user_id,
'searches': profile.searches,
'display_settings': profile.display_settings
}
json_data = json.dumps(data)
return HttpResponse({json_data}, content_type='application/json')
models
class UserProfile(models.Model):
user_id = models.CharField(max_length=255, unique=True)
searches = models.ArrayField(model_container=RecentSearch, null=True)
display_settings = models.ArrayField(model_container=DisplaySetting, null=True)
objects = models.DjongoManager()
i'm learning to automate some web tasks, and i have this code to fill out a form using python scrapy. It takes essentially mongolian cyrrilic sentences and translate it into traditional mongolian script. What i want to do is to read the sentences into a list from a file ("test.txt") and then to get the translation back and print it in the console.
import scrapy
class BichigSpider(scrapy.Spider):
name = "bichig"
allowed_domains = ["http://trans.mglip.com"]
start_urls = ["http://trans.mglip.com/EnglishC2T.aspx"]
def LoadListofSentences(self):
output = []
with open('test.txt', 'r',encoding='utf-8') as f:
for el in f:
output.append(el)
return output
def parse(self, response):
ListeSent = self.LoadListofSentences()
for sent in ListeSent:
formdata = {'inputCyrillic_ID': sent}
yield scrapy.http.FormRequest.from_response(response,
formdata=formdata,
clickdata={'name': 'ButtonTran_ID'},
callback=self.parse1)
def parse1(self, response):
print(response.css('outPutTraditonalM_ID::text').get())
Content of file "test.txt", located in the folder spiders with the init file:
Хоёр мянга арван гурван оны өвлийн цагаан будан татсан гэрэлт өдөр Өвөрхангай аймгийн театрт Монгол найргийн дархан цэц Дэндэвийн Пүрэвдорж гуайн нэрэмжит “Болор цом” наадмыг жулдрайхан би эхлүүлж байлаа.
Үндсэндээ түрүү жилийн эзэн дараа жилийнхээ цомыг нээдэг тэрхүү уламжлалын дагуу 30 жилийнх нь тэгш ойд түрүүлсний хувьд Пүрэвдорж гуайнхаа наадмыг “Өвгөд минь өндрийн салхи болохуйд” шүлгээрээ нээж байсан.
Тэрхүү хувь заяагаа би дандаа сүслэн боддог.
Пүрэвдорж гуай өөрөө санаачилж эхлүүлсэн, анхны түрүүг нь хүртсэн авшигтай эл наадмыг 80 насных нь их ойгоор өөрийнх нь нэрэмжит болон хүмүүн биеийг олсон халуун голомт Өврийн хангай нутагт нь болоход нээнэ гэдэг хувь заяа гэхээс өөр яалтай.
Шүлгээ дуудчихаад, шүлгээ уншихаар гараанаас эргэх гэж буй морьд шиг тогтож ядан байгаа найрагчдынхаа дунд орж ирэхэд омог төгөлдөр байсан даа.
Эрдэнэ-Очир ах минь, Хөөдөө ах минь, дархад Мийгаа ах минь, Лхамсүрэнжавын Ганзул ах минь бүгд шүлгээ унших гээд ирийтэл зогсож байсан сан.
Мөн ч алтанхан сайхан он жил байж шүү.
Наадмын урьд өдөр “Уран үгсийн чуулган”-д Монголын яруу найргийн их оргилууд Бавуугийн Лхагвасүрэн, Тангадын Галсан, Пунцагийн Бадарч нарын тоосон дунд орж цомын эзэн хэмээн Ичинхорлоо найрагчаар зарлуулж байснаа одоо эргээд бодох нь ээ, айх ч шиг.
Их найргийн бурхдын сүрд дарагдсан бахдал дүүрэн он цаг минь.
But i get this error after writing scrapy crawl bichig in the command line:
Traceback (most recent call last):
File "C:\Users\User\anaconda3\envs\Test\Scripts\scrapy-script.py", line 10, in <module>
sys.exit(execute())
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\cmdline.py", line 124, in execute
cmds = _get_commands_dict(settings, inproject)
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\cmdline.py", line 52, in _get_commands_dict
cmds = _get_commands_from_module('scrapy.commands', inproject)
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\cmdline.py", line 33, in _get_commands_from_module
for cmd in _iter_command_classes(module):
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\cmdline.py", line 20, in _iter_command_classes
for module in walk_modules(module_name):
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\utils\misc.py", line 88, in walk_modules
submod = import_module(fullpath)
File "C:\Users\User\anaconda3\envs\Test\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\commands\parse.py", line 10, in <module>
from scrapy.utils import display
File "C:\Users\User\anaconda3\envs\Test\lib\site-packages\scrapy\utils\display.py", line 5, in <module>
import ctypes
File "C:\Users\User\anaconda3\envs\Test\lib\ctypes\__init__.py", line 8, in <module>
from _ctypes import Union, Structure, Array
ImportError: DLL load failed while importing _ctypes:the specified module could not be found
Can somebody help me please with this or any other viable solution?
Solved! I needed to reinstall python that's all.
I want to add one of my models to the admin panel, but this error falls:
> Traceback (most recent call last): File
> "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\threading.py",
> line 932, in _bootstrap_inner
> self.run() File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\threading.py",
> line 870, in run
> self._target(*self._args, **self._kwargs) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\utils\autoreload.py",
> line 53, in wrapper
> fn(*args, **kwargs) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\core\management\commands\runserver.py",
> line 109, in inner_run
> autoreload.raise_last_exception() File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\utils\autoreload.py",
> line 76, in raise_last_exception
> raise _exception[1] File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\core\management\__init__.py",
> line 357, in execute
> autoreload.check_errors(django.setup)() File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\utils\autoreload.py",
> line 53, in wrapper
> fn(*args, **kwargs) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\__init__.py",
> line 24, in setup
> apps.populate(settings.INSTALLED_APPS) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\apps\registry.py",
> line 122, in populate
> app_config.ready() File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\contrib\admin\apps.py",
> line 24, in ready
> self.module.autodiscover() File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\contrib\admin\__init__.py",
> line 26, in autodiscover
> autodiscover_modules('admin', register_to=site) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\utils\module_loading.py",
> line 47, in autodiscover_modules
> import_module('%s.%s' % (app_config.name, module_to_search)) File
> "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\importlib\__init__.py",
> line 127, in import_module
> return _bootstrap._gcd_import(name[level:], package, level) File "<frozen importlib._bootstrap>", line 1014, in _gcd_import File
> "<frozen importlib._bootstrap>", line 991, in _find_and_load File
> "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
> File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
> File "<frozen importlib._bootstrap_external>", line 783, in
> exec_module File "<frozen importlib._bootstrap>", line 219, in
> _call_with_frames_removed File "C:\Users\smirn\OneDrive\Desktop\SYZYGY\Coding\Python\Django\Megan\Fridge\admin.py",
> line 13, in <module>
> admin.site.register([Product, Fridge, ProductObject]) File "C:\Users\smirn\AppData\Local\Programs\Python\Python38-32\lib\site-packages\django\contrib\admin\sites.py",
> line 104, in register
> if model._meta.abstract: AttributeError: type object 'ProductObject' has no attribute '_meta'
models.py:
from django.db import models as m
from django.conf import settings
import datetime
def mounth():
now = datetime.datetime.now()
return now + datetime.timedelta(days=20)
class Product(m.Model):
product_name = m.CharField(max_length=200)
product_calories = m.PositiveIntegerField(blank=True)
def __str__(self):
return self.product_name
class Fridge(m.Model):
OPTIONS = (
("1", "BASIC"),
("2", "PRO"),
("3", "KING"),
)
fridge_owner = m.ForeignKey(settings.AUTH_USER_MODEL, on_delete=m.CASCADE)
fridge_mode = m.CharField(max_length=5, choices=OPTIONS)
class Recipe(m.Model):
recipe_name = m.CharField(max_length=200)
recipe_products = m.ManyToManyField(Product)
recipe_description = m.TextField()
def __str__(self):
return self.recipe_name
class ProductObject(): # Не знаю как сделать правильно. Вдруг это можно реализовать по другому
product_obj_fridge = m.ForeignKey(Fridge, on_delete=m.CASCADE)
product_obj_product = m.ManyToManyField(Product)
product_shelf_life = m.DateField(default=mounth())
product_count = m.PositiveIntegerField(default=1)
class Meta:
ordering = ('product_shelf_life', )
admin.py:
from django.contrib import admin
from .models import Product, Fridge, Recipe, ProductObject
from tinymce.widgets import TinyMCE
from django.db import models
# Register your models here.
class RecipeAdmin(admin.ModelAdmin):
formfield_overrides = {
models.TextField: {'widget': TinyMCE}
}
admin.site.register([Product, Fridge, ProductObject])
admin.site.register(Recipe, RecipeAdmin)
If I remove the ProductObject in the registration in the admin panel, then there will be no error, but I do not understand this error at all. It seems that everything should be correct, but for some reason not
Please, help me!
In model ProductObject you are missing m.Model in the definition.
Without this the Meta field can not be constructed.
I am creating my first spider
I am following and writing code as per in Scrapy tutorial in docs.scrapy
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
I wrote the below code and saved in C:\Users\DODAAD\scrapy_goodfriday_A01\scrapy_goodfriday_A01\spiders as quotes_spider.py
I have done as per instruction but this error pops up in cmd
(base) C:\Users\DODAAD\scrapy_goodfriday_A01>scrapy crawl quotes
Traceback (most recent call last):
File "C:\Users\DODAAD\Anaconda3\Scripts\scrapy-script.py", line 10, in <module>
sys.exit(execute())
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\cmdline.py", line 144, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\crawler.py", line 280, in __init__
super(CrawlerProcess, self).__init__(settings)
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\crawler.py", line 152, in __init__
self.spider_loader = self._get_spider_loader(settings)
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\crawler.py", line 146, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\spiderloader.py", line 68, in from_settings
return cls(settings)
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\spiderloader.py", line 24, in __init__
self._load_all_spiders()
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\spiderloader.py", line 51, in
_load_all_spiders
for module in walk_modules(name):
File "C:\Users\DODAAD\Anaconda3\lib\site-packages\scrapy\utils\misc.py", line 78, in walk_modules
submod = import_module(fullpath)
File "C:\Users\DODAAD\Anaconda3\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 677, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 728, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "C:\Users\DODAAD\scrapy_goodfriday_A01\scrapy_goodfriday_A01\spiders\quotes_spider-
checkpoint.py",
line 33, in <module>
"execution_count": null,
NameError: name 'null' is not defined
NameError: name 'null' is not defined
I don't understand what meant by name error null
Fix indents for your class:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
I'm having this error while trying to run my Teonite_project/web_scrapper.py script:
File "C:/Users/kfhei/Desktop/Teonite_project/Teonite_project/web_scrapper.py", line 9, in <module>
django.setup()
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\__init__.py", line 19, in setup
configure_logging(settings.LOGGING_CONFIG, settings.LOGGING)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 56, in __getattr__
self._setup(name)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 43, in _setup
self._wrapped = Settings(settings_module)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 106, in __init__
mod = importlib.import_module(self.SETTINGS_MODULE)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 994, in _gcd_import
File "<frozen importlib._bootstrap>", line 971, in _find_and_load
File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 678, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "C:\Users\kfhei\Desktop\Teonite_project\Teonite_project\Teonite_project\settings.py", line 15, in <module>
django.setup()
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\__init__.py", line 19, in setup
configure_logging(settings.LOGGING_CONFIG, settings.LOGGING)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 56, in __getattr__
self._setup(name)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 43, in _setup
self._wrapped = Settings(settings_module)
File "C:\Users\kfhei\Desktop\Teonite_project\env\lib\site-packages\django\conf\__init__.py", line 125, in __init__
raise ImproperlyConfigured("The SECRET_KEY setting must not be empty.")
django.core.exceptions.ImproperlyConfigured: The SECRET_KEY setting must not be empty.
My script:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
#import simplejson as json
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Teonite_project.settings")
import django
django.setup()
from words.models import Word
from authors.models import Author
URL_DOMAIN = 'https://teonite.com/blog/'
def get_links(url):
'''
Returning the array of links
to blog articles from the website
'''
html = get_html(url)
links = []
for link in html.findAll('a'):
link = link.get('href')
if link == None or link[6:9] == 'tag' or link[6:10]=='page':
pass
elif link[:6] == '/blog/':
link = url[:19] + link
links.append(link)
return links
def get_html(url):
'''
Returning the HTML of the website
'''
req = Request(url)
html_page = urlopen(req)
html = BeautifulSoup(html_page, "html.parser")
return html
def get_text(url):
'''
Extracting the post content of
the articles from the blog
'''
html = get_html(url)
text =''
for content in html.select('.post-content'):
text = text + content.text
return content.text
def get_author(url):
'''
Extracting the name of the Author
from the articles
'''
html = get_html(url)
for author in html.select('.author-content'):
return author.text
if __name__ == '__main__':
'''
Main function tasks:
* Extract the neccessary data from the website,
* Save it to the database
'''
links = get_links(URL_DOMAIN)
author_dict_database = {}
word_dict_database = {}
for link in links:
text = get_text(link).strip()
wordslist = text.split()
author = get_author(link).strip()
for word in wordslist:
if not word.isalpha():
wordslist.remove(word)
word = word.lower()
if author in author_dict_database:
for word in wordslist:
if word in word_dict_database:
author_dict_database[author][word] += 1
else:
author_dict_database[author][word] = 1
else:
for word in wordslist:
if word in word_dict_database:
word_dict_database[word] += 1
else:
word_dict_database[word] = 1
author_dict_database[author] = word_dict_database
#Saving values to postgres database
for key_author,word_dict in author_dict_database:
database_author = Author(author=key_author)
for key_word, value_count in word_dict:
database_word = Word(author=key_author, words_list=key_word, words_count=value_count)
As you can seen I tried different things to make it work. I've read several topics in stackoverflow and tried to search in different websites.
Also I have configured the wsgi.py file
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Teonite_project.settings")
Unfortunately I have no idea why this happens, because I have set the SECRET_KEY in my Teonite_project/Teonite_project/settings.py
Basically all I want is to run my script, and add the scrapped values to my postgres database in my authors, and words models.