The URL below is used both to extract content and to be followed, but nothing happens after the content is extracted. I don't know why it was not followed.
There seem to be no errors.
You request the author URL twice: the first time to scrape the list of authors, the second time to scrape the current author's details. Dumping the Scrapy stats (at the end of the log) shows a "dupefilter/filtered" count, which means Scrapy filtered out duplicate URLs. Scraping will work if you remove the "parse_content" function and write code like this:
def parse(self, response):
    if 'tags' in response.meta:
        author = {}
        author['url'] = response.url
        name = response.css(".people-name::text").extract()
        join_date = response.css(".joined-time::text").extract()
        following_no = response.css(".following-number::text").extract()
        followed_no = response.css(".followed-number::text").extract_first()
        first_onsale = response.css(".first-onsale-date::text").extract()
        total_no = response.css(".total-number::text").extract()
        comments = total_no[0]
        onsale = total_no[1]
        columns = total_no[2]
        ebooks = total_no[3]
        essays = total_no[4]
        author['tags'] = response.meta['tags']
        author['name'] = name
        author['join_date'] = join_date
        author['following_no'] = following_no
        author['followed_no'] = followed_no
        author['first_onsale'] = first_onsale
        author['comments'] = comments
        author['onsale'] = onsale
        author['columns'] = columns
        author['ebooks'] = ebooks
        author['essays'] = essays
        yield author

    authors = response.css('section.following-agents ul.bd li.item')
    for author in authors:
        tags = author.css('div.author-tags::text').extract_first()
        url = author.css('a.lnk-avatar::attr(href)').extract_first()
        yield response.follow(url=url, callback=self.parse, meta={'tags': tags})
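If you would rather keep your original parse/parse_content split, an alternative (a minimal sketch, not tested against the site) is to tell the dupefilter not to drop the second request for the same URL with dont_filter=True:

# Sketch: keep two callbacks, but allow the author page to be requested again.
# dont_filter=True stops Scrapy's dupefilter from discarding a URL it has
# already seen.
yield response.follow(url=url, callback=self.parse_content,
                      meta={'tags': tags}, dont_filter=True)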
Be careful: I removed some lines during testing. You need to use random user agents in the HTTP headers, a request delay, or a proxy. I ran the collection and now I get a "403 Forbidden" status code.
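For the delay part, a minimal settings.py sketch might look like this (the values are illustrative; rotating user agents usually needs a downloader middleware such as the scrapy-fake-useragent package rather than a single setting):

# settings.py -- throttling options that help avoid 403 responses
DOWNLOAD_DELAY = 2                  # wait roughly 2 seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True     # vary the delay between 0.5x and 1.5x
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # one request at a time per domain

# A static User-Agent can be set here; rotating agents needs a middleware.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"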
Thanks in advance for your useful help!
We need to crawl all the product pages of the website https://www.astegiudiziarie.it/ and save them to our MySQL database.
This website does not have a sitemap, so we chose the summary page https://www.astegiudiziarie.it/Immobili/Riepilogo as the data source.
From there you can see that the first page lists regions, then provinces, then districts, and finally the product pages we need to crawl and save.
We are developing this by using Scrapy and Python 3.8.5
During the execution flow from the regions page down to the products page (the entries), I pass data through the meta argument.
When I tested and printed to a CSV file with the columns 'region', 'province', 'district', I got columns with wrong values.
The problem is that when I run scrapy crawl products -o f.csv from the terminal,
the output file contains a table of 'region', 'province', 'district', but the row content is not displayed as expected.
I don't understand what the bug in this code could be.
I very much appreciate your response and support for creating a better web!
Thank you!
import scrapy
from scrapy.http.request import Request

protocol = 'http://'
domain = 'www.astegiudiziarie.it'
path = '/Immobili/Riepilogo'
target_url = protocol + domain + path

dev_entry_counter = 0
dev_entry_limit = 100

def file_debug(message):
    f = open('debug.txt', 'a')
    f.write(message + "\n\n")
    f.close()

class ProductsSpider(scrapy.Spider):
    name = 'products'
    allowed_domains = [domain]
    start_urls = [target_url]

    def parse(self, response):  # Parsing of 'regione' (Layer 1)
        regioni = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for regione in regioni:  # Iterating rows
            regione_name = regione.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l1 = regione.xpath('//td/a/@href').extract()
            for href_l1 in hrefs_l1:  # Iterating columns
                abs_href_l1 = target_url + href_l1
                yield Request(url=abs_href_l1, callback=self.parse_provincia,
                              meta={'regione': regione_name})

    def parse_provincia(self, response):  # Parsing of 'provincia' (Layer 2)
        province = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for provincia in province:
            provincia_name = provincia.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l2 = provincia.xpath('//td/a/@href').extract()
            for href_l2 in hrefs_l2:
                abs_href_l2 = target_url + href_l2
                yield Request(url=abs_href_l2, callback=self.parse_comune,
                              meta={'regione': response.meta['regione'],
                                    'provincia': provincia_name})

    def parse_comune(self, response):  # Parsing of 'comune' (Layer 3)
        comuni = response.xpath('//table[@id="panoramica"]/tbody/tr')
        for comune in comuni:
            comune_name = comune.xpath('//th[@scope="rowgroup"]//text()').extract_first()
            hrefs_l3 = comune.xpath('//td/a/@href').extract()
            for href_l3 in hrefs_l3:
                abs_href_l3 = protocol + domain + href_l3
                yield Request(url=abs_href_l3, callback=self.parse_entries,
                              meta={'regione': response.meta['regione'],
                                    'provincia': response.meta['provincia'],
                                    'comune': comune_name})

    def parse_entries(self, response):  # Parsing of 'entries' (list of the products)
        entries = response.xpath('//*[@class="listing-item"]')
        properties = {}
        properties['regione'] = response.meta['regione']
        properties['provincia'] = response.meta['provincia']
        properties['comune'] = response.meta['comune']
        yield properties
The problem is that when you are looping through Selectors and calling the xpath method, you should make your queries relative to the current selector, e.g. with a leading ./.
So in your parse method you should use
regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
Otherwise you'll just get the first th in the entire document.
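To make the difference concrete, a small sketch of what each form returns inside the loop:

# Inside `for regione in regioni:` each `regione` is a row Selector.
# An absolute query ignores that context and searches the whole document:
regione.xpath('//th[@scope="rowgroup"]//text()').get()   # always the FIRST <th> on the page

# A relative query searches only inside the current row:
regione.xpath('./th[@scope="rowgroup"]//text()').get()   # the <th> of this row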
Another tip for your use case is to use response.follow instead of constructing the Requests yourself. For example, your parse method (which is pretty much the same as your other methods) can become
def parse(self, response):
    for regione in response.xpath('//table[@id="panoramica"]/tbody/tr'):
        regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
        if not regione_name:
            continue
        for link in regione.xpath("./td/a"):
            yield response.follow(link, callback=self.parse_provincia, meta=...)
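As a side note, if you are on Scrapy 1.7 or newer (an assumption about your version), cb_kwargs is the recommended way to pass your own values to a callback and keeps meta free for Scrapy's internal keys. A sketch:

# Sketch: passing the region name via cb_kwargs instead of meta.
def parse(self, response):
    for regione in response.xpath('//table[@id="panoramica"]/tbody/tr'):
        regione_name = regione.xpath('./th[@scope="rowgroup"]//text()').get()
        if not regione_name:
            continue
        for link in regione.xpath('./td/a'):
            yield response.follow(link, callback=self.parse_provincia,
                                  cb_kwargs={'regione': regione_name})

def parse_provincia(self, response, regione):  # cb_kwargs arrive as keyword arguments
    ...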
I am building a spider with Scrapy. I want to access every item in a list and then scrape all the data inside each link, but when I run the spider it doesn't scrape the data. What am I missing?
import scrapy
from ..items import JobscraperItem
from scrapy.linkextractors import LinkExtractor

class JobscraperSpider(scrapy.Spider):
    name = 'jobspider'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    def parse(self, response):
        job_detail = response.xpath('//div[@class="list"]/div/a')
        yield from response.follow_all(job_detail, self.parse_jobspider)

    def parse(self, response):
        items = JobscraperItem()
        job_title = response.xpath('//h1/text()').extract()
        company = response.xpath('//h2/b/text()').extract()
        company_url = response.xpath('//div[@class="pull-left"]/a/text()').extract()
        description = response.xpath('//div[@class="aviso"]/text()').extract()
        salary = response.xpath('//div[@id="aviso"]/p[1]/text()').extract()
        city = response.xpath('//div[@id="aviso"]/p[2]/text()').extract()
        district = response.xpath('//div[@id="aviso"]/p[5]/text()').extract()
        publication_date = response.xpath('//div[@id="publicado"]/text()').extract()
        apply = response.xpath('//p[@class="text-center"]/b/text()').extract()
        job_type = response.xpath('//div[@id="resumen"]/p[3]/text()').extract()
        items['job_title'] = job_title
        items['company'] = company
        items['company_url'] = company_url
        items['description'] = description
        items['salary'] = salary
        items['city'] = city
        items['district'] = district
        items['publication_date'] = publication_date
        items['apply'] = apply
        items['job_type'] = job_type
        yield items
From what I can see, one of the issues is that you are creating two functions called parse(). Since you are using a self.parse_jobspider in your first parse function, I'm guessing that your second parse function is named incorrectly.
Also, are you sure that the URL in the start_urls is correct? https://cccc/bolsa/ofertas?oferta=&lugar=&categoria= doesn't direct to anywhere which would also explain why data isn't being scraped.
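Assuming the second method really is meant to be the detail-page callback, a minimal sketch of the fix is simply to rename it to match the name used in follow_all:

# Sketch: the listing page keeps `parse`, the detail page gets its own callback.
def parse(self, response):
    job_detail = response.xpath('//div[@class="list"]/div/a')
    yield from response.follow_all(job_detail, callback=self.parse_jobspider)

def parse_jobspider(self, response):   # was the second `parse`
    items = JobscraperItem()
    items['job_title'] = response.xpath('//h1/text()').extract()
    # ... remaining fields unchanged ...
    yield items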
rules = (
    Rule(LinkExtractor(allow=('/bolsa/166',)), follow=True, callback='parse_item'),
)
I resolved this by adding the rules above so the spider follows every link and scrapes the data inside (a sketch of how they fit into a spider follows).
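For reference, rules like these only take effect on a CrawlSpider subclass, not a plain scrapy.Spider. A hedged sketch of the full context (the class name and the extracted field are illustrative):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class JobsCrawlSpider(CrawlSpider):          # note: CrawlSpider, not scrapy.Spider
    name = 'jobspider_crawl'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    rules = (
        # Follow every offer link matching the pattern and hand the page to parse_item.
        Rule(LinkExtractor(allow=('/bolsa/166',)), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        # Detail-page extraction goes here (same XPaths as in the question).
        yield {'job_title': response.xpath('//h1/text()').extract()}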
I am very new to web scraping, and I am trying to scrape this online forum: https://community.whattoexpect.com/forums/postpartum-depression.html
It is a two-level site where the main page is a list of discussion posts, and you can click on each post to get the full content and see the reply comments. The main site also has pagination.
I want my final CSV to have the main post in one row and the replies in the following rows. I will be using the same ID for the main post and its replies, so that they can be linked.
Here is my Scrapy spider so far:
import scrapy
import datetime

class PeripartumSpider(scrapy.Spider):
    name = 'peripartum'
    start_urls = ['http://www.community.whattoexpect.com/forums/postpartum-depression.html']

    def parse(self, response):
        for post_link in response.xpath('//*[@id="group-discussions"]/div[3]/div/div/a/@href').extract():
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_thread)

        # Checks if the main page has a link to the next page; if True, keep parsing.
        next_page = response.xpath('(//a[@class="page-link"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    # Going into each post and extracting information.
    def parse_thread(self, response):
        original_post = response.xpath("//*[@class='__messageContent fr-element fr-view']/p/text()").extract()
        title = response.xpath("//*[@class='discussion-original-post__title']/text()").extract_first()
        author_name = response.xpath("//*[@class='discussion-original-post__author__name']/text()").extract_first()
        unixtime = response.xpath("//*[@class='discussion-original-post__author__updated']/@data-date").extract_first()
        unixtime = int(unixtime) / 1000  # Removing milliseconds
        timestamp = datetime.datetime.utcfromtimestamp(unixtime).strftime("%m/%d/%Y %H:%M")
        replies_list = response.xpath("//*[@class='discussion-replies__list']").getall()

        # Getting the comments and their information for each post
        reply_post = response.xpath(".//*[@class='wte-reply__content__message __messageContent fr-element fr-view']/p/text()").extract()
        reply_author = response.xpath("//*[@class='wte-reply__author__name']/text()").extract()
        reply_time = response.xpath("//*[@class='wte-reply__author__updated']/@data-date").extract()
        for reply in reply_time:
            reply_date = int(reply_time) / 1000  # Removing milliseconds
            reply_timestamp = datetime.datetime.utcfromtimestamp(reply_date).strftime("%m/%d/%Y %H:%M")
            yield {
                "title": title,
                "author_name": author_name,
                "time": timestamp,
                "post": original_post,
                "reply_author": reply_author,
                "reply_timestamp": reply_timestamp,
                "replies": reply_post
            }
When I try to run my spider, I get 0 items crawled. I am not sure I am correctly following the links to each post. Also, should I use something like Python's csv library to get the comments onto the next rows, tagged with the original post's ID?
You have to take care of both
the existing web page's document structure
and the structure of your parsing code.
There may be better approaches than the following, such as identifying the n reply nodes first and then looping over them, in which case you don't need to zip the lists together (a sketch of that variant follows the code below). But you can use this as a starting point.
import scrapy
import datetime

class PeripartumSpider(scrapy.Spider):
    name = 'peripartum'
    start_urls = ['https://community.whattoexpect.com/forums/postpartum-depression.html']

    def parse(self, response):
        for post_link in response.xpath('//*[@id="group-discussions"]/div[3]/div/div/a/@href').extract():
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_thread)

        # Checks if the main page has a link to the next page; if True, keep parsing.
        next_page = response.xpath('(//a[@class="page-link"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    # Going into each post and extracting information.
    def parse_thread(self, response):
        original_post = response.xpath("//*[@class='__messageContent fr-element fr-view']/p/text()").extract()
        title = response.xpath("//*[@class='discussion-original-post__title']/text()").extract_first()
        author_name = response.xpath("//*[@class='discussion-original-post__author__name']/text()").extract_first()
        unixtime = response.xpath("//*[@class='discussion-original-post__author__updated']/@data-date").extract_first()
        unixtime = int(unixtime) / 1000  # Removing milliseconds
        timestamp = datetime.datetime.utcfromtimestamp(unixtime).strftime("%m/%d/%Y %H:%M")
        replies_list = response.xpath("//*[@class='discussion-replies__list']").getall()

        # Getting the comments and their information for each post
        replies_post = response.xpath(".//*[@class='wte-reply__content__message __messageContent fr-element fr-view']/p/text()").extract()
        replies_author = response.xpath("//*[@class='wte-reply__author__name']/text()").extract()
        replies_time = response.xpath("//*[@class='wte-reply__author__updated']/@data-date").extract()
        replies = zip(replies_post, replies_author, replies_time)
        for reply_post, reply_author, reply_time in replies:
            reply_date = int(reply_time) / 1000  # Removing milliseconds
            reply_timestamp = datetime.datetime.utcfromtimestamp(reply_date).strftime("%m/%d/%Y %H:%M")
            yield {
                "title": title,
                "author_name": author_name,
                "time": timestamp,
                "post": original_post,
                "reply_author": reply_author,
                "reply_timestamp": reply_timestamp,
                "replies": reply_post
            }
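Here is the variant mentioned above, as a hedged sketch: iterate over each reply container with relative selectors instead of zipping three parallel lists, and attach a thread ID (derived here from the URL slug, an assumption about how you want to link rows) to both the main post and each reply so they can be joined in the CSV. The per-reply container selector (li elements under the replies list) is also an assumption about the markup.

# Sketch: one row per post/reply, all sharing a thread_id derived from the URL.
def parse_thread(self, response):
    thread_id = response.url.rstrip('/').split('/')[-1]   # assumption: the slug identifies the thread

    yield {
        "thread_id": thread_id,
        "kind": "post",
        "title": response.xpath("//*[@class='discussion-original-post__title']/text()").get(),
        "author": response.xpath("//*[@class='discussion-original-post__author__name']/text()").get(),
        "text": " ".join(response.xpath("//*[@class='__messageContent fr-element fr-view']/p/text()").getall()),
    }

    # Each reply node is handled on its own, so author/time/text always stay aligned.
    for reply in response.xpath("//*[@class='discussion-replies__list']//li"):
        yield {
            "thread_id": thread_id,
            "kind": "reply",
            "author": reply.xpath(".//*[@class='wte-reply__author__name']/text()").get(),
            "text": " ".join(reply.xpath(".//p/text()").getall()),
        }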
You may also have to take care of pagination in the comments.
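If the replies are paginated, one hedged way to handle it (the next-page selector below is an assumption about the markup, not something verified against the site) is to follow the next page of the thread with the same callback:

# Sketch: inside parse_thread, follow a "next page of replies" link if one exists.
more_replies = response.xpath('//a[contains(@class, "next")]/@href').get()  # assumed selector
if more_replies:
    yield response.follow(more_replies, callback=self.parse_thread)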
I'm currently writing a vacancies scraper with Scrapy to parse about 3M vacancy items.
The spider now works and successfully scrapes items and stores them in PostgreSQL, but it does so pretty slowly.
In 1 hour I stored only 12k vacancies, so I'm really far from 3M of them.
The thing is that in the end I will need to scrape and update the data once per day, and with the current performance I would need more than a day just to parse all the data.
I'm new to data scraping, so I may be doing some basic thing wrong, and I'll be very grateful if anybody can help me.
Code of my spider:
import scrapy
import urllib.request
from lxml import html
from ..items import JobItem

class AdzunaSpider(scrapy.Spider):
    name = "adzuna"
    start_urls = [
        'https://www.adzuna.ru/search?loc=136073&pp=10'
    ]

    def parse(self, response):
        job_items = JobItem()
        items = response.xpath("//div[@class='sr']/div[@class='a']")

        def get_redirect(url):
            response = urllib.request.urlopen(url)
            response_code = response.read()
            result = str(response_code, 'utf-8')
            root = html.fromstring(result)
            final_url = root.xpath('//p/a/@href')[0]
            final_final_url = final_url.split('?utm', 1)[0]
            return final_final_url

        for item in items:
            id = None
            data_aid = item.xpath(".//@data-aid").get()
            redirect = item.xpath(".//h2/a/@href").get()
            url = get_redirect(redirect)
            url_header = item.xpath(".//h2/a/strong/text()").get()
            if item.xpath(".//p[@class='as']/@data-company-name").get() == None:
                company = item.xpath(".//p[@class='as']/text()").get()
            else:
                company = item.xpath(".//p[@class='as']/@data-company-name").get()
            loc = item.xpath(".//p/span[@class='loc']/text()").get()
            text = item.xpath(".//p[@class='at']/span[@class='at_tr']/text()").get()
            salary = item.xpath(".//p[@class='at']/span[@class='at_sl']/text()").get()
            job_items['id'] = id
            job_items['data_aid'] = data_aid
            job_items['url'] = url
            job_items['url_header'] = url_header
            job_items['company'] = company
            job_items['loc'] = loc
            job_items['text'] = text
            job_items['salary'] = salary
            yield job_items

        next_page = response.css("table.pg td:last-child ::attr('href')").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Use indexes in your table
Insert in bulk instead of inserting one-by-one (see the pipeline sketch after this list)
Minimize use of meta in your Request
Use tuple instead of list where possible
Set CONCURRENT_ITEMS=100; setting it higher decreases performance
Try to use fewer middlewares and pipelines
Set AUTOTHROTTLE_ENABLED=False in settings.py
Set TELNETCONSOLE_ENABLED=False in settings.py
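For the bulk-insert point, a minimal item-pipeline sketch (assuming psycopg2 and a vacancies table; the connection details and columns are illustrative, not your actual schema) could buffer items and flush them with executemany:

import psycopg2

class BulkInsertPipeline:
    """Buffers items and writes them to PostgreSQL in batches instead of row-by-row."""

    def __init__(self, batch_size=500):
        self.batch_size = batch_size
        self.buffer = []

    def open_spider(self, spider):
        self.conn = psycopg2.connect(dbname="jobs", user="scrapy", password="...", host="localhost")
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        self.buffer.append((item['data_aid'], item['url'], item['company'], item['salary']))
        if len(self.buffer) >= self.batch_size:
            self._flush()
        return item

    def _flush(self):
        self.cur.executemany(
            "INSERT INTO vacancies (data_aid, url, company, salary) VALUES (%s, %s, %s, %s)",
            self.buffer,
        )
        self.conn.commit()
        self.buffer.clear()

    def close_spider(self, spider):
        if self.buffer:
            self._flush()
        self.cur.close()
        self.conn.close()

You would enable it in settings.py via ITEM_PIPELINES, e.g. {'myproject.pipelines.BulkInsertPipeline': 300} (the module path is hypothetical).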
As some of you may have gathered, I'm learning scrapy to scrape some data off of Google Scholar for a research project that I am running. I have a file that contains many article titles for which I am scraping citations. I read in the file using pandas, generate the URLs that need scraping, and start scraping.
One problem that I face is 503 errors. Google shuts me off fairly quickly, and many entries remain unscraped. This is a problem that I am working on using some middleware provided by Crawlera.
Another problem I face is that when I export my scraped data, I have a hard time matching the scraped data to what I was trying to look for. My input data is a CSV file with three fields -- 'Authors','Title','pid' where 'pid' is a unique identifier.
I use pandas to read in the file and generate URLs for scholar based off the title. Each time a given URL is scraped, my spider goes through the scholar webpage, and picks up the title, publication information and cites for each article listed on that page.
Here is how I generate the links for scraping:
class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    # generate a var to store links
    links = []
    # create the URLs to crawl
    for entry in queries:
        links.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = links
For example, one title from my data file could be the paper 'Elephants Don't Play Chess' by Rodney Brooks with 'pid' 5067. The spider goes to
http://scholar.google.com/scholar?q=allintitle%3Aelephants+don%27t+play+chess
Now on this page, there are six hits. The spider gets all six hits, but they need to be assigned the same 'pid'. I know I need to insert a line somewhere that reads something like item['pid'] = data.pid.apply("something") but I can't figure out exactly how I would do that.
Below is the rest of the code for my spider. I am sure the way to do this is pretty straightforward, but I can't think of how to get the spider to know which entry of data.pid it should look for if that makes sense.
def parse(self, response):
    # initialize something to hold the data
    items = []
    sel = Selector(response)
    # get each 'entry' on the page
    # an entry is a self contained div
    # that has the title, publication info
    # and cites
    entries = sel.xpath('//div[@class="gs_ri"]')
    # a counter for the entry that is being scraped
    count = 1
    for entry in entries:
        item = ScholarscrapeItem()
        # get the title
        title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
        # the title is messy
        # clean up
        item['title'] = "".join(title)
        # get publication info
        # clean up
        author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
        item['authors'] = "".join(author)
        # get the portion that contains citations
        cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
        # find the part that says "Cited by"
        match = re.search("Cited by \d+", str(cite_string))
        # if it exists, note the number
        if match:
            cites = re.search("\d+", match.group()).group()
        # if not, there is no citation info
        else:
            cites = None
        item['cites'] = cites
        item['entry'] = count
        # iterate the counter
        count += 1
        # append this item to the list
        items.append(item)
    return items
I hope this question is well-defined, but please let me know if I can be more clear. There is really not much else in my scraper except some lines at the top importing things.
Edit 1: Based on suggestions below, I have modified my code as follows:
# test-case: http://scholar.google.com/scholar?q=intitle%3Amigratory+birds
import re
from pandas import *
import urllib
from scrapy.spider import Spider
from scrapy.selector import Selector
from scholarscrape.items import ScholarscrapeItem

class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    pid = data.pid
    # generate a var to store links
    urls = []
    # create the URLs to crawl
    for entry in queries:
        urls.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = (
        (urls, pid),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            item['pid'] = response.meta['pid']
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
You need to populate your list start_urls with tuples (url, pid).
Now redefine the method make_requests_from_url(url):
class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print '!!!!!!!!!!!', pid, '!!!!!!!!!!!!'
        pass
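Note that the tuple-unpacking signature only works on Python 2, and make_requests_from_url has been deprecated in newer Scrapy versions. A sketch of the same idea on current Scrapy/Python 3 uses start_requests instead (the 'Title' and 'pid' column names are taken from the question's CSV; everything else is illustrative):

import urllib.parse
import scrapy
from pandas import read_csv

class ScholarSpider(scrapy.Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    def start_requests(self):
        data = read_csv("../../data/master_jeea.csv")  # expects 'Title' and 'pid' columns
        for title, pid in zip(data.Title, data.pid):
            url = ("http://scholar.google.com/scholar?q=allintitle%3A"
                   + urllib.parse.quote(title))
            # Carry the identifier along with the request so parse() can attach it.
            yield scrapy.Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        # ...extract entries as before and set item['pid'] = pid on each one...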