Source : https://docs.twisted.org/en/twisted-17.9.0/core/howto/time.html
My Code:
import scrapy
from scrapy.crawler import CrawlerRunner
from twisted.internet import task
from twisted.internet import reactor
class Livescores1(scrapy.Spider):
    """Spider for the league 'total' standings table; feeds Total.json."""

    name = 'Total'
    # Each spider class carries its own runner so each one can point its
    # FEEDS export at a different output file.
    runner = CrawlerRunner(settings={
        "FEEDS": {
            "Total.json": {"format": "json", "overwrite": True},
        },
    })

    def start_requests(self):
        # Single entry point: the "league-total" view of the Super Lig table.
        yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-total')

    def parse(self, response):
        # Emit the text content of every table cell as one item.
        for cell in response.css('td'):
            yield {'total': cell.css('::text').get()}
class Livescores2(scrapy.Spider):
    """Spider for the league 'home' standings table; feeds Home.json."""

    name = 'Home'
    # Dedicated runner so this spider writes to its own feed file.
    runner = CrawlerRunner(settings={
        "FEEDS": {
            "Home.json": {"format": "json", "overwrite": True},
        },
    })

    def start_requests(self):
        # Single entry point: the "league-home" view of the Super Lig table.
        yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-home')

    def parse(self, response):
        # Emit the text content of every table cell as one item.
        for cell in response.css('td'):
            yield {'total': cell.css('::text').get()}
class Livescores3(scrapy.Spider):
    """Spider for the league 'away' standings table; feeds Away.json."""

    name = 'Away'
    # Dedicated runner so this spider writes to its own feed file.
    runner = CrawlerRunner(settings={
        "FEEDS": {
            "Away.json": {"format": "json", "overwrite": True},
        },
    })

    def start_requests(self):
        # Single entry point: the "league-away" view of the Super Lig table.
        yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-away')

    def parse(self, response):
        # Emit the text content of every table cell as one item.
        for cell in response.css('td'):
            yield {'total': cell.css('::text').get()}
loopTimes = 99999999999999999999999999999  # effectively "loop forever"
failInTheEnd = False
_loopCounter = 0


def runEverySecond():
    """Called at every loop interval: schedule one crawl per spider.

    Fix for the original ``NameError: 'runner' is not defined``: there is no
    module-level runner.  Each spider class already owns its own
    ``CrawlerRunner`` (configured with a distinct FEEDS file), so schedule
    each crawl on that class's runner — this keeps the three outputs
    (Total.json / Home.json / Away.json) separate.
    """
    global _loopCounter
    Livescores1.runner.crawl(Livescores1)
    Livescores2.runner.crawl(Livescores2)
    Livescores3.runner.crawl(Livescores3)
    if _loopCounter < loopTimes:
        _loopCounter += 1
        print('A new second has passed.')
    return


loop = task.LoopingCall(runEverySecond)
# Start looping every 1 second.
loopDeferred = loop.start(1.0)
reactor.run()
This code has an error
"runner" is not defined
All i want is to save crawling to different json files. For this I added a separate CrawlerRunner(settings) for each class. But I should add another runner = CrawlerRunner(settings) for runEverySecond function, otherwise the code does not work.
If I add this (although in these settings I can give a single json file name) all classes obey this rule and don't save data to different files!
This structure has a function that runs repeatedly; it provides the looping behaviour for the spiders.
Can I add different settings for all classes to CrawlerRunner(settings) which I create inside runEverySecond? Is this possible or should I try a different solution?
Could you help me to fix this?
Thank you very much
I am trying to scrape all cars from the website: www.webuycars.co.za
I am using scrapy to do this and each page has 24 vehicles that I want to send to a json file.
Through data analysis it seems that I am just scraping the first page only or overwriting the variable used to create the json file.
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    """Question code: POSTs one search to the webuycars API and dumps results."""

    name = 'car'
    # Full search filter sent as the POST body; "to"/"size" control paging.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}

    def start_requests(self):
        # The API takes a JSON POST rather than a plain GET.
        request_headers = {
            "content-type": "application/json",
            "User-Agent": "mozilla/5.0"
        }
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers=request_headers
        )

    def parse(self, response):
        # NOTE(review): only response['total']['value'] changes inside the
        # loop, so the same single page of 'data' is appended 272 times —
        # this is the duplicate/overwrite problem described in the question.
        response = json.loads(response.body)
        filename = "webuycar.json"
        cars = []
        for offset in range(0, 6528, 24):
            response['total']['value'] = offset
            cars.append(response['data'])
        with open(filename, "w") as f:
            json.dump(cars, f, indent=4)
        for record in response['data']:
            yield {'Title': record['OnlineDescription']}


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I would like to fix this as it messes with the accuracy of the database I have created and makes redundant data prevalent.
I have looked at my json file to see if the issue was from extraction. It seems that my webscraper is the problem. I would appreciate some thoughts on this.
You shouldn't try to dump the data into a file from the parse method. You should either use command line arguments, or in the case when running as a script like in your example, you can use feed exports.
Like this:
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    """Answer v1: same single API call, but items go through a feed export."""

    name = 'car'
    # Full search filter sent as the POST body.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    # Feed export replaces the hand-rolled json.dump from the question code.
    custom_settings = {
        "FEEDS": {
            "webuycar.json": {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4
            }
        }
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers={
                "content-type": "application/json",
                "User-Agent": "mozilla/5.0"
            }
        )

    def parse(self, response):
        payload = response.json()
        # Yield the (mutated) full payload once per page offset ...
        for offset in range(0, 6528, 24):
            payload['total']['value'] = offset
            yield payload
        # ... then one small item per vehicle description.
        for entry in payload['data']:
            yield {'Title': entry['OnlineDescription']}


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
Im not totally sure this solves your problem because you are still scraping a single url, but this should avoid overwriting the file.
Although I tested this and the output json file was 7143882 lines long
Update:
After taking a closer look at your code, I think that this is closer to what you are actually trying to achieve. This makes many calls to the api and extracts all 24 OnlineDescription fields from each api call response.
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    """Answer v2: one POST per 24-result window, all titles via feed export."""

    name = 'car'
    # Full search filter sent as the POST body; "to" is advanced per request.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    custom_settings = {
        "FEEDS": {
            "webuycar.json": {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4
            }
        }
    }

    def start_requests(self):
        # Walk the result windows 24, 48, ... and POST one search per window.
        for upper in range(24, 6528, 24):
            self.body["to"] = upper
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(self.body),
                method="POST",
                headers={
                    "content-type": "application/json",
                    "User-Agent": "mozilla/5.0"
                }
            )

    def parse(self, response):
        # Each response carries one page; emit every vehicle description.
        payload = response.json()
        for vehicle in payload['data']:
            yield {"Title": vehicle['OnlineDescription']}


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I am trying to fetch dynamic phone number from this page (among others): https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html
The phone number appears after a click on the element div with the class page-action click-tel. I am trying to get to this data with scrapy_splash using a LUA script to execute a click.
After pulling splash on my ubuntu:
sudo docker run -d -p 8050:8050 scrapinghub/splash
Here is my code so far (I am using a proxy service) :
class company(scrapy.Spider):
    """Splash-driven spider that clicks the tel button on a europages page."""

    name = "company"
    custom_settings = {
        "FEEDS": {
            '/home/ubuntu/scraping/europages/data/company.json': {
                'format': 'jsonlines',
                'encoding': 'utf8'
            }
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        "SPLASH_URL": 'http://127.0.0.1:8050/',
        "SPIDER_MIDDLEWARES": {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        "DUPEFILTER_CLASS": 'scrapy_splash.SplashAwareDupeFilter',
        "HTTPCACHE_STORAGE": 'scrapy_splash.SplashAwareFSCacheStorage'
    }
    allowed_domains = ['www.europages.fr']

    def __init__(self, company_url):
        # NOTE(review): the constructor argument is ignored — the URL is forced.
        self.company_url = "https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html"  ##forced
        # NOTE(review): company_item is defined elsewhere in the project.
        self.item = company_item()
        # Lua run by Splash: load the page, click the tel button, return HTML.
        self.script = """
function main(splash)
splash.private_mode_enabled = false
assert(splash:go(splash.args.url))
assert(splash:wait(0.5))
local element = splash:select('.page-action.click-tel')
local bounds = element:bounds()
element:mouse_click{x=bounds.width/2, y=bounds.height/2}
splash:wait(4)
return splash:html()
end
"""

    def start_requests(self):
        # NOTE(review): the proxy value is a mangled placeholder — substitute
        # real credentials/host before running.
        splash_meta = {
            'splash': {
                'endpoint': 'execute',
                'url': self.company_url,
                'args': {
                    'lua_source': self.script,
                    'proxy': 'http://usernamepassword#proxyhost:port',
                    'html': 1,
                    'iframes': 1
                }
            }
        }
        yield scrapy.Request(
            url=self.company_url,
            callback=self.parse,
            dont_filter=True,
            meta=splash_meta
        )

    def parse(self, response):
        # NOTE(review): BeautifulSoup must be imported elsewhere in the file.
        soup = BeautifulSoup(response.body, "lxml")
        print(soup.find('div', {'class', 'page-action click-tel'}))
The problem is that it has no effect, I still have nothing as if no button were clicked.
Shouldn't the return splash:html() return the results of element:mouse_click{x=bounds.width/2, y=bounds.height/2} (as element:mouse_click() waits for the changes to appear) in response.body ?
Am I missing something here ?
Most times when sites load data dynamically, they do so via background XHR requests to the server. A close examination of the network tab when you click the 'telephone' button, shows that the browser sends an XHR request to the url https://www.europages.fr/InfosTelecomJson.json?uidsid=DEU241700-00101&id=1330. You can emulate the same in your spider and avoid using scrapy splash altogether. See sample implementation below using one url:
import scrapy
from urllib.parse import urlparse
class Company(scrapy.Spider):
    """Fetch the phone number via the site's own XHR JSON endpoint (no Splash)."""

    name = 'company'
    allowed_domains = ['www.europages.fr']
    start_urls = ['https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html']

    def parse(self, response):
        # Derive the two parameters the InfosTelecomJson.json endpoint needs:
        # the uid (URL path stem) and the numeric id embedded in the tel
        # button's onclick handler.
        uuid = urlparse(response.url).path.split('/')[-1].rstrip('.html')
        # Fixed: XPath attribute tests use '@', not '#' — the pasted source
        # text was garbled ("//div[#itemprop=...]" is invalid XPath).
        id = response.xpath("//div[@itemprop='telephone']/a/@onclick").re_first(r"event,'(\d+)',")
        yield scrapy.Request(
            f"https://www.europages.fr/InfosTelecomJson.json?uidsid={uuid}&id={id}",
            callback=self.parse_address,
        )

    def parse_address(self, response):
        # The endpoint answers with JSON like {'digits': '+49 ...'};
        # yield it directly as the scraped item.
        yield response.json()
I get the response
{'digits': '+49 220 69 53 30'}
I have a website that's need to scrape the data
"https://www.forever21.com/us/shop/catalog/category/f21/sale#pageno=1&pageSize=120&filter=price:0,250&sort=5" but I cannot retrieve all the data; the site also has pagination and it uses JavaScript as well.
any idea on how I will scrape all the items? Here's my code
def parse_2(self, response):
    """Yield one GpdealsSpiderItem_f21 per product tile on a Forever21 page."""
    for tile in response.css('div.pi_container'):
        item = GpdealsSpiderItem_f21()
        # Populate each field straight from the tile's sub-selectors.
        item['f21_title'] = tile.css('p.p_name::text').extract_first()
        item['f21_regular_price'] = tile.css('span.p_old_price::text').extract_first()
        item['f21_sale_price'] = tile.css('span.p_sale.t_pink::text').extract_first()
        item['f21_photo_url'] = tile.css('img::attr(data-original)').extract_first()
        item['f21_description_url'] = tile.css('a.item_slider.product_link::attr(href)').extract_first()
        yield item
Please help Thank you
One of the first steps in web scraping project should be looking for an API that the website uses to get the data. Not only does it save you parsing HTML, using an API also saves provider's bandwidth and server load. To look for an API, use your browser's developer tools and look for XHR requests in the network tab. In your case, the web site makes POST requests to this URL:
https://www.forever21.com/eu/shop/Catalog/GetProducts
You can then simulate the XHR request in Scrapy to get the data in JSON format. Here's the code for the spider:
# -*- coding: utf-8 -*-
import json
import scrapy
class Forever21Spider(scrapy.Spider):
    """Scrape sale products from the Forever21 JSON catalog API, page by page."""

    name = 'forever21'
    url = 'https://www.forever21.com/eu/shop/Catalog/GetProducts'
    # Base POST payload; the page number is filled in per request.
    payload = {
        'brand': 'f21',
        'category': 'sale',
        'page': {'pageSize': 60},
        'filter': {
            'price': {'minPrice': 0, 'maxPrice': 250}
        },
        'sort': {'sortType': '5'}
    }

    def _page_request(self, page_no):
        """Build the XHR-style POST request for one catalog page."""
        payload = self.payload.copy()
        payload['page']['pageNo'] = page_no
        return scrapy.Request(
            self.url, method='POST', body=json.dumps(payload),
            headers={'X-Requested-With': 'XMLHttpRequest',
                     'Content-Type': 'application/json; charset=UTF-8'},
            callback=self.parse, meta={'pageNo': page_no}
        )

    def start_requests(self):
        # scrape the first page
        yield self._page_request(1)

    def parse(self, response):
        # parse the JSON response and extract the data
        data = json.loads(response.text)
        for product in data['CatalogProducts']:
            yield {
                'title': product['DisplayName'],
                'regular_price': product['OriginalPrice'],
                'sale_price': product['ListPrice'],
                'photo_url': 'https://www.forever21.com/images/default_330/%s' % product['ImageFilename'],
                'description_url': product['ProductShareLinkUrl']
            }
        # a completely full page implies more results: request the next page
        if len(data['CatalogProducts']) == self.payload['page']['pageSize']:
            yield self._page_request(response.meta['pageNo'] + 1)
I am trying to get the cookies from a splash request, but I keep getting an error.
Here is the code I am using:
class P2PEye(scrapy.Spider):
    """Question code: tries to read cookies from a Splash render.html response."""

    name = 'p2peyeSpider'
    allowed_domains = ['p2peye.com']
    start_urls = ['https://www.p2peye.com/platform/h9/']

    def start_requests(self):
        # Lua that asks Splash for the cookie table only.
        script = '''
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(0.5))
return {
cookies = splash:get_cookies(),
}
end
'''
        for url in self.start_urls:
            yield SplashRequest(url, callback=self.parse, endpoint='render.html',
                                args={'wait': 1, 'lua_source': script})

    def parse(self, response):
        # Both prints fail for a render.html response (see the discussion):
        # no Set-Cookie headers, and SplashTextResponse has no cookiejar.
        print(response.request.headers.getlist('Set-Cookie'))
        print(response.cookiejar)
This is my settings.py
# --- scrapy-splash wiring (settings.py) ---
SPLASH_URL = 'http://127.0.0.1:8050'
CRAWLERA_ENABLED = False

# Downloader / spider middlewares required by scrapy-splash.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100}

# Splash-aware replacements for the dupe filter and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Cookie handling and debug flags.
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
The result of response.request.headers.getlist('Set-Cookie') is [],
and response.cookiejar got an error: AttributeError: 'SplashTextResponse' object has no attribute 'cookiejar'.
So how can I get the cookies without causing an error?
To access response.cookiejar you need to return SplashJsonResponse
try returning extra fields on your Lua script:
# Lua script for Splash's /execute endpoint.  Returning the extra fields
# (url, headers, http_status, html) alongside cookies makes scrapy-splash
# build a SplashJsonResponse, which is what exposes response.cookiejar.
script = '''
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
'''
Using the LUA script below the response will be a dict with cookies located at key cookies
-- Load the page, wait half a second, then hand Splash's cookie jar
-- back to the caller under the key "cookies".
function main(splash)
    local url = splash.args.url
    assert(splash:go(url))
    assert(splash:wait(0.5))
    return {
        cookies = splash:get_cookies(),
    }
end
So to access you should use
# d holds the decoded JSON body of the Splash HTTP response, e.g.:
# d = requests.post('splash').json()
print(d['cookies'])