Multi-page crawler gives wrong results - Python

Thanks in advance for your help!
We need to crawl all the product pages of the website https://www.astegiudiziarie.it/ and save them to our MySQL database.
This website does not have a sitemap, so we chose the summary page https://www.astegiudiziarie.it/Immobili/Riepilogo as the data source.
From there the site drills down from region to province to district, and finally to the product pages we need to crawl and save.
We are developing this with Scrapy and Python 3.8.5.
While following the flow from the regions page down to the product pages (entries), I pass data along through the meta argument.
When I test and export to a CSV file with the columns 'region', 'province', 'district', the columns contain wrong values.
The problem appears when I run scrapy crawl products -o f.csv from the terminal:
the output file contains a table of 'region', 'province', 'district', but the row contents are not what I expect.
I can't figure out where the bug is in this code.
Any help is much appreciated.
Thank you!
import scrapy
from scrapy.http.request import Request

protocol = 'http://'
domain = 'www.astegiudiziarie.it'
path = '/Immobili/Riepilogo'
target_url = protocol + domain + path

dev_entry_counter = 0
dev_entry_limit = 100

def file_debug(message):
    f = open('debug.txt', 'a')
    f.write(message + "\n\n")
    f.close()

class ProductsSpider(scrapy.Spider):
    name = 'products'
    allowed_domains = [domain]
    start_urls = [target_url]

    def parse(self, response):  # Parsing of 'regione' (Layer 1)
        regioni = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for regione in regioni:  # Iterating rows
            regione_name = regione.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l1 = regione.xpath('//td/a/@href').extract()
            for href_l1 in hrefs_l1:  # Iterating columns
                abs_href_l1 = target_url + href_l1
                yield Request(url=abs_href_l1, callback=self.parse_provincia,
                              meta={'regione': regione_name})

    def parse_provincia(self, response):  # Parsing of 'provincia' (Layer 2)
        province = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for provincia in province:
            provincia_name = provincia.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l2 = provincia.xpath('//td/a/@href').extract()
            for href_l2 in hrefs_l2:
                abs_href_l2 = target_url + href_l2
                yield Request(url=abs_href_l2, callback=self.parse_comune,
                              meta={'regione': response.meta['regione'],
                                    'provincia': provincia_name})

    def parse_comune(self, response):  # Parsing of 'comune' (Layer 3)
        comuni = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for comune in comuni:
            comune_name = comune.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l3 = comune.xpath('//td/a/@href').extract()
            for href_l3 in hrefs_l3:
                abs_href_l3 = protocol + domain + href_l3
                yield Request(url=abs_href_l3, callback=self.parse_entries,
                              meta={'regione': response.meta['regione'],
                                    'provincia': response.meta['provincia'],
                                    'comune': comune_name})

    def parse_entries(self, response):  # Parsing of 'entries' (list of the products)
        entries = response.xpath('//*[@class="listing-item"]')
        properties = {}
        properties['regione'] = response.meta['regione']
        properties['provincia'] = response.meta['provincia']
        properties['comune'] = response.meta['comune']
        yield properties

The problem is that when you loop over Selectors and call their xpath method, you should make the queries relative to the current selector, e.g. by starting them with ./.
So in your parse method you should use
regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
otherwise you'll just get the first th in the entire document.
Another tip for your use case is to use response.follow instead of constructing the Requests yourself. For example, your parse method (which is pretty much the same as your other methods) can become
def parse(self, response):
    for regione in response.xpath('//table[@id="panoramica"]/tbody/tr'):
        regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
        if not regione_name:
            continue
        for link in regione.xpath("./td/a"):
            yield response.follow(link, callback=self.parse_provincia, meta=...)
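Filling in the meta=... placeholder is straightforward, since the question already shows which keys it carries. A minimal sketch of the first two levels (my own illustration, not the answer author's code; field names are taken from the question):

def parse(self, response):  # Layer 1: regioni
    for regione in response.xpath('//table[@id="panoramica"]/tbody/tr'):
        # './' keeps the query relative to the current row
        regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
        if not regione_name:
            continue
        for link in regione.xpath('./td/a'):
            # response.follow accepts the <a> selector directly and resolves the href
            yield response.follow(link, callback=self.parse_provincia,
                                  meta={'regione': regione_name})

def parse_provincia(self, response):  # Layer 2: province, same pattern
    for provincia in response.xpath('//table[@id="panoramica"]/tbody/tr'):
        provincia_name = provincia.xpath('./th[@scope="rowgroup"]//text()').get()
        if not provincia_name:
            continue
        for link in provincia.xpath('./td/a'):
            yield response.follow(link, callback=self.parse_comune,
                                  meta={'regione': response.meta['regione'],
                                        'provincia': provincia_name})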

Related

Scrapy: every time I yield a request, another function is triggered as well - can't see why

Here is my spider.
It is supposed to assign a list fetched from a Google Sheet to the variable denied via load_gsheet(). In the code this function is called just once, but in the logs it runs as many times as the POST request to the endpoint (send_to_endpoint()) is executed. Where is the error?
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
import json
from datetime import datetime
import logging
import requests
# from scrapy.utils.project import get_project_settings

class Code1Spider(scrapy.Spider):
    name = 'c_cointelegraph'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['https://cointelegraph.com/press-releases/']
    id = int(str(datetime.now().timestamp()).split('.')[0])
    denied = []
    gs_id = ''
    endpoint_url = ''

    def parse(self, response):
        # Returns settings values as dict
        settings = self.settings.copy_to_dict()
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        # Assigns a list of stop words from GS to the spider attribute
        self.denied = self.load_gsheet()
        for i in response.xpath('//a[@class="post-card-inline__title-link"]/@href').getall():
            yield Request(response.urljoin(i), callback=self.parsed)

    def parsed(self, response):
        # Set deny_domains to the current domain so we could get all external urls
        denied_domains = self.allowed_domains[0]
        links = LinkExtractor(deny_domains=denied_domains,
                              restrict_xpaths=('//article[@class="post__article"]'))
        links = links.extract_links(response)
        links = [i.url for i in links]
        # Checks the list of external links against the list of stop words
        links = [i for i in links if not any(b in i for b in self.denied)]
        company = response.xpath('//h2//text()').getall()
        if company:
            company = [i.split('About ')[-1].strip() for i in company if 'About ' in i.strip()]
        if company:
            company = company[0]
        else:
            company = ''
        d = {
            'heading': response.xpath('//h1[@class="post__title"]/text()').get().strip(),
            'url': response.url,
            'pubDate': self.get_pub_date(response.xpath('//script[contains(text(),"datePublished")]/text()').get()),
            'links': links,
            'company_name': company,
            'ScrapeID': self.id,
        }
        # Used for debugging, just to see the printed item
        yield d
        # Create a POST request to the endpoint
        req = self.send_to_endpoint(d)
        # Send the request to the endpoint
        yield req

    def get_pub_date(self, d):
        d = json.loads(d)
        pub_date = d['datePublished']
        return pub_date

    def load_gsheet(self):
        # Loads a list of stop words from a predefined Google Sheet
        gs_id = self.gs_id
        url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gs_id)
        r = requests.get(url)
        denied = r.text.splitlines()[1:]
        logging.info(denied)
        return denied

    def send_to_endpoint(self, d):
        url = self.endpoint_url
        r = scrapy.Request(url, method='POST',
                           body=json.dumps(d),
                           headers={'Content-Type': 'application/json'},
                           dont_filter=True)
        return r
Whenever I yield req, the load_gsheet() function runs as well, hitting Google Sheets. If I comment out yield req, load_gsheet() is called just once, as it is supposed to be.
Why does this happen? I have triple-checked the code line by line and added comments. I have no idea what I'm missing.
This is happening because you don't assign a callback to the request object that you construct in the send_to_endpoint() method.
The default callback is the parse method, so all of the requests created in send_to_endpoint are automatically routed back to parse, which calls load_gsheet for every single one of those POST requests.
The solution is to either take the load_gsheet call out of the parse method, or explicitly assign a callback to all of the POST requests that isn't self.parse.
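For the second option, a minimal sketch (my own, not from the original answer; handle_endpoint_response is a hypothetical name) of the spider's send_to_endpoint with an explicit callback:

    def send_to_endpoint(self, d):
        return scrapy.Request(
            self.endpoint_url,
            method='POST',
            body=json.dumps(d),
            headers={'Content-Type': 'application/json'},
            callback=self.handle_endpoint_response,  # anything but self.parse
            dont_filter=True,
        )

    def handle_endpoint_response(self, response):
        # hypothetical callback: just record what the endpoint answered
        logging.info('endpoint responded with status %s', response.status)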

Speed up Python Scrapy crawler

I'm currently writing a vacancy scraper with Scrapy to parse about 3M vacancy items.
The spider now works and successfully scrapes items and stores them in PostgreSQL, but it does so pretty slowly.
In one hour I stored only 12k vacancies, so I'm really far from 3M.
In the end I will need to scrape and update the data once per day, and with the current performance I would need more than a day just to parse it all.
I'm new to data scraping, so I may be doing something basic wrong, and I'd be very grateful if anybody can help me.
Code of my spider:
import scrapy
import urllib.request
from lxml import html
from ..items import JobItem

class AdzunaSpider(scrapy.Spider):
    name = "adzuna"
    start_urls = [
        'https://www.adzuna.ru/search?loc=136073&pp=10'
    ]

    def parse(self, response):
        job_items = JobItem()
        items = response.xpath("//div[@class='sr']/div[@class='a']")

        def get_redirect(url):
            response = urllib.request.urlopen(url)
            response_code = response.read()
            result = str(response_code, 'utf-8')
            root = html.fromstring(result)
            final_url = root.xpath('//p/a/@href')[0]
            final_final_url = final_url.split('?utm', 1)[0]
            return final_final_url

        for item in items:
            id = None
            data_aid = item.xpath(".//@data-aid").get()
            redirect = item.xpath(".//h2/a/@href").get()
            url = get_redirect(redirect)
            url_header = item.xpath(".//h2/a/strong/text()").get()
            if item.xpath(".//p[@class='as']/@data-company-name").get() is None:
                company = item.xpath(".//p[@class='as']/text()").get()
            else:
                company = item.xpath(".//p[@class='as']/@data-company-name").get()
            loc = item.xpath(".//p/span[@class='loc']/text()").get()
            text = item.xpath(".//p[@class='at']/span[@class='at_tr']/text()").get()
            salary = item.xpath(".//p[@class='at']/span[@class='at_sl']/text()").get()

            job_items['id'] = id
            job_items['data_aid'] = data_aid
            job_items['url'] = url
            job_items['url_header'] = url_header
            job_items['company'] = company
            job_items['loc'] = loc
            job_items['text'] = text
            job_items['salary'] = salary
            yield job_items

        next_page = response.css("table.pg td:last-child ::attr('href')").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Use indexes in your table
Insert in bulk instead of inserting one-by-one (see the pipeline sketch after this list)
Minimize use of meta in your Request
Use tuples instead of lists where possible
Set CONCURRENT_ITEMS=100; setting it higher decreases performance
Try to use fewer middlewares and pipelines
Set AUTOTHROTTLE_ENABLED=False in settings.py
Set TELNETCONSOLE_ENABLED=False in settings.py
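For the bulk-insert point, here is a rough pipeline sketch (my own illustration, assuming psycopg2 and a vacancies table whose columns match the item fields; connection parameters, table name, and module path are placeholders):

# pipelines.py -- buffer items and write them to PostgreSQL in batches
import psycopg2
from psycopg2.extras import execute_values

class BulkInsertPipeline:
    """Collects items in memory and flushes them in one INSERT per batch."""

    def __init__(self, batch_size=500):
        self.batch_size = batch_size
        self.buffer = []

    def open_spider(self, spider):
        # connection parameters are placeholders
        self.conn = psycopg2.connect(dbname='jobs', user='scraper',
                                     password='secret', host='localhost')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        self.buffer.append((item['data_aid'], item['url'], item['url_header'],
                            item['company'], item['loc'], item['text'], item['salary']))
        if len(self.buffer) >= self.batch_size:
            self.flush()
        return item

    def flush(self):
        execute_values(self.cur,
                       "INSERT INTO vacancies (data_aid, url, url_header, company, loc, text, salary) VALUES %s",
                       self.buffer)
        self.conn.commit()
        self.buffer = []

    def close_spider(self, spider):
        if self.buffer:
            self.flush()
        self.cur.close()
        self.conn.close()

# enable it in settings.py, e.g.:
# ITEM_PIPELINES = {'myproject.pipelines.BulkInsertPipeline': 300}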

Scrapy not following links, with no error

The URL below is used both to extract content and to be followed, but nothing happens after the content is extracted. I don't know why it is not followed.
There seem to be no errors.
You run a Request for the author URL twice: the first time to scrape the list of authors, the second time to scrape the current author's details. The Scrapy stats dumped at the end of the log show a "dupefilter/filtered" count, which means Scrapy filtered the duplicate URLs. Scraping will work if you remove the parse_content function and write the code like this:
def parse(self, response):
    if 'tags' in response.meta:
        author = {}
        author['url'] = response.url
        name = response.css(".people-name::text").extract()
        join_date = response.css(".joined-time::text").extract()
        following_no = response.css(".following-number::text").extract()
        followed_no = response.css(".followed-number::text").extract_first()
        first_onsale = response.css(".first-onsale-date::text").extract()
        total_no = response.css(".total-number::text").extract()
        comments = total_no[0]
        onsale = total_no[1]
        columns = total_no[2]
        ebooks = total_no[3]
        essays = total_no[4]
        author['tags'] = response.meta['tags']
        author['name'] = name
        author['join_date'] = join_date
        author['following_no'] = following_no
        author['followed_no'] = followed_no
        author['first_onsale'] = first_onsale
        author['comments'] = comments
        author['onsale'] = onsale
        author['columns'] = columns
        author['ebooks'] = ebooks
        author['essays'] = essays
        yield author

    authors = response.css('section.following-agents ul.bd li.item')
    for author in authors:
        tags = author.css('div.author-tags::text').extract_first()
        url = author.css('a.lnk-avatar::attr(href)').extract_first()
        yield response.follow(url=url, callback=self.parse, meta={'tags': tags})
Be careful: I removed some lines during testing. You need to use random user agents in the HTTP headers, a request delay, or a proxy. I ran the collection and now I get a "403 Forbidden" status code.
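As a starting point for those counter-measures, a minimal settings.py sketch (my own, with illustrative values that are not tuned for this site):

# settings.py
DOWNLOAD_DELAY = 2                # pause between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True   # jitter the delay (0.5x to 1.5x of DOWNLOAD_DELAY)
AUTOTHROTTLE_ENABLED = True       # back off automatically when responses slow down
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # anything but the default Scrapy UA
# Rotating user agents or proxies needs a downloader middleware,
# e.g. a community package such as scrapy-fake-useragent (assumption: installed separately).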

How to limit number of followed pages per site in Python Scrapy

I am trying to build a spider that could efficiently scrape text information from many websites. Since I am a Python user I was referred to Scrapy. However, in order to avoid scraping huge websites, I want to limit the spider to scrape no more than 20 pages of a certain "depth" per website. Here is my spider:
class DownloadSpider(CrawlSpider):
    name = 'downloader'
    download_path = '/home/MyProjects/crawler'
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),)

    def __init__(self, *args, **kwargs):
        super(DownloadSpider, self).__init__(*args, **kwargs)
        self.urls_file_path = [kwargs.get('urls_file')]
        data = open(self.urls_file_path[0], 'r').readlines()
        self.allowed_domains = [urlparse(i).hostname.strip() for i in data]
        self.start_urls = ['http://' + domain for domain in self.allowed_domains]

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        self.fname = self.download_path + urlparse(response.url).hostname.strip()
        open(str(self.fname) + '.txt', 'a').write(response.url)
        open(str(self.fname) + '.txt', 'a').write('\n')
urls_file is a path to a text file with URLs. I have also set the max depth in the settings file. Here is my problem: if I set CLOSESPIDER_PAGECOUNT, it closes the spider when the total number of scraped pages (regardless of which site) reaches that value. However, I need to stop scraping when I have scraped, say, 20 pages from each URL.
I also tried keeping count with a variable like self.parsed_number += 1, but this didn't work either -- it seems that Scrapy doesn't go URL by URL but mixes them up.
Any advice is much appreciated!
To do this you can create your own link extractor class based on SgmlLinkExtractor. It should look something like this:
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class LimitedLinkExtractor(SgmlLinkExtractor):
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
                 process_value=None, deny_extensions=None, max_pages=20):
        self.max_pages = max_pages
        SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains,
                                   deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                                   tags=tags, attrs=attrs, canonicalize=canonicalize, unique=unique,
                                   process_value=process_value, deny_extensions=deny_extensions)

    def extract_links(self, response):
        base_url = None
        if self.restrict_xpaths:
            sel = Selector(response)
            base_url = get_base_url(response)
            body = u''.join(f
                            for x in self.restrict_xpaths
                            for f in sel.xpath(x).extract()
                            ).encode(response.encoding, errors='xmlcharrefreplace')
        else:
            body = response.body

        links = self._extract_links(body, response.url, response.encoding, base_url)
        links = self._process_links(links)
        links = links[0:self.max_pages]
        return links
The code of this subclass is based entirely on the code of the SgmlLinkExtractor class. I've just added the self.max_pages variable to the constructor and a line that trims the list of links at the end of the extract_links method. You could also trim this list in a more intelligent way.
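A hypothetical usage sketch (mine, using the same old scrapy.contrib import paths as the answer): the extractor slots into the spider's rule in place of SgmlLinkExtractor, capping how many links are followed from any single page.

from scrapy.contrib.spiders import CrawlSpider, Rule

class DownloadSpider(CrawlSpider):
    name = 'downloader'
    # follow at most 20 links extracted from any single page
    rules = (Rule(LimitedLinkExtractor(max_pages=20), callback='parse_item', follow=True),)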
I'd make a per-class variable, initialize it with stats = defaultdict(int), and increment self.stats[response.url] (or maybe the key could be a tuple like (website, depth) in your case) in parse_item.
This is how I imagine it - it should work in theory. Let me know if you need an example.
FYI, you can extract the base URL and calculate the depth with the help of urlparse.urlparse (see the docs).
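A rough sketch of that counter idea applied to the spider from the question (my own illustration, not tested; note it only skips saving pages past the limit, and Scrapy will still schedule the extra requests unless the links are also filtered):

from collections import defaultdict
from urlparse import urlparse  # Python 2, matching the question's code

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class DownloadSpider(CrawlSpider):
    name = 'downloader'
    download_path = '/home/MyProjects/crawler'
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),)
    max_pages_per_site = 20

    def __init__(self, *args, **kwargs):
        super(DownloadSpider, self).__init__(*args, **kwargs)
        self.urls_file_path = [kwargs.get('urls_file')]
        data = open(self.urls_file_path[0], 'r').readlines()
        self.allowed_domains = [urlparse(i).hostname.strip() for i in data]
        self.start_urls = ['http://' + domain for domain in self.allowed_domains]
        self.stats = defaultdict(int)  # pages saved so far, keyed by hostname

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        host = urlparse(response.url).hostname.strip()
        self.stats[host] += 1
        if self.stats[host] > self.max_pages_per_site:
            return  # already saved enough pages for this site
        fname = self.download_path + host
        with open(fname + '.txt', 'a') as f:
            f.write(response.url + '\n')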

File Storage Problem with Python Web Crawler

I am screen-scraping data using a web crawler and storing the results (tweets from a Twitter page) as separate HTML files for each user I'm crawling. I intend to later parse the HTML files and store the data in a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate HTML file for each follower:
import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
for x in searched:
    print x
print "Program is completed."
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()

def add_to_U(user):
    U.append(user)

def site(follower):  # creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower):  # obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response):  # creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class': "list-tweet"})  # to obtain tweets of a follower
    for tag in tags:
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup):  # to check whether there is more than one page on mobile twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup):  # to get the link to go to the next page of tweets on twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' + c
            return d

def crawl(follower, in_depth):  # main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
        while tweets:
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")
print("Program done. Look at output file.")
More specifically, I seem to get a separate HTML file for about the first five followers, and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (with depth = 2 you only get one level of recursion in the full code). Also, you only grab the first five names from the followers list: for name in list(names)[0:5]:. So you get six people total: the starting follower and their first five friends.
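To make the arithmetic concrete, here is a toy calculation (my own; it ignores the deduplication performed by the searched set) of how many files each depth value can produce when every user contributes five friends:

def count_files(in_depth, branching=5):
    # one file for the current follower, plus files for the first `branching` friends
    if in_depth <= 0:
        return 0
    return 1 + branching * count_files(in_depth - 1, branching)

print(count_files(2))  # 6  -> roughly what the full crawler (depth = 2) produces
print(count_files(3))  # 31 -> roughly what the snippet (depth = 3) can produce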
