How to modify a URL before following it in Scrapy? - python

I'm new to Scrapy and this is my second spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class SitenameScrapy(scrapy.Spider):
    name = "sitename"
    allowed_domains = ['www.sitename.com', 'sitename.com']
    rules = [Rule(LinkExtractor(unique=True), follow=True)]

    def start_requests(self):
        urls = ['http://www.sitename.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_cat)

    def parse_cat(self, response):
        links = LinkExtractor().extract_links(response)
        for link in links:
            if '/category/' in link.url:
                yield response.follow(link, self.parse_cat)
            if '/product/' in link.url:
                yield response.follow(link, self.parse_prod)

    def parse_prod(self, response):
        pass
My problem is that sometimes I have links like http://sitename.com/path1/path2/?param1=value1&param2=value2, and for me param1 is not important; I want to remove it from the URL before response.follow. I think I could do it with a regex, but I'm not sure that is the 'right way' in Scrapy. Maybe I should use some kind of rule for this?

I think you could use the url_query_cleaner function from the w3lib library. Something like:
from w3lib.url import url_query_cleaner

...

def parse_cat(self, response):
    links = LinkExtractor().extract_links(response)
    for link in links:
        url = url_query_cleaner(link.url, ('param2',))
        if '/category/' in url:
            yield response.follow(url, self.parse_cat)
        if '/product/' in url:
            yield response.follow(url, self.parse_prod)
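If you would rather name the parameter to drop instead of the ones to keep, url_query_cleaner also accepts a remove flag. A minimal sketch, assuming param1 is the only parameter you want stripped (the URL here is just the example from the question):

from w3lib.url import url_query_cleaner

url = 'http://sitename.com/path1/path2/?param1=value1&param2=value2'

# Default behaviour: keep only the listed parameters.
url_query_cleaner(url, ('param2',))
# -> 'http://sitename.com/path1/path2/?param2=value2'

# Alternative: list the parameters to drop and pass remove=True.
url_query_cleaner(url, ('param1',), remove=True)
# -> 'http://sitename.com/path1/path2/?param2=value2'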

Related

Interpreting callbacks and cb_kwargs with scrapy

I'm close to a personal milestone with Scrapy. The aim is to properly understand callback and cb_kwargs; I've read the documentation countless times, but I learn best with actual code, practice, and an explanation.
I have an example scraper; the aim is to grab the book name and price, then go into each book page and extract a single piece of information. I'm also trying to understand how to properly get information from the next few pages, which I know depends on understanding how callbacks work.
When I run my script it returns results only for the first page. How do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

            for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
                if next_page is not None:
                    yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
    for url in self.start_url:
        yield scrapy.Request(
            url,
            callback=self.parse,
            cb_kwargs={'page_count': 0}
        )

def parse(self, response, next_page):
    if page_count > 3:
        return
    ...
    page_count += 1
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs={'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
- It should be start_requests, and self.start_urls (inside the function).
- get() will return the first result; what you want is getall(), which returns a list.
- There is no need for a for loop around the "next_page" part; it's not a mistake, just unnecessary.
- In the line for url in books.xpath you're getting every URL twice; again, not a mistake, but still...
- In data = response.xpath('//div[@class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container; you can check that len(data.getall()) == 1.
- book_quote = response.xpath('//p[@class="instock availability"]//text()').get() will return \n; look at the page source and try to find out why (hint: the 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        # option 1:
        book_quote = response.xpath('//p[@class="instock availability"]/i/following-sibling::text()').get().strip()
        # option 2:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
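As for the alternative page-count approach: the TypeError comes from the mismatch between the key passed in cb_kwargs and the parameter name in the parse signature. Whatever keys you put in the cb_kwargs dict are passed to the callback as keyword arguments, so the callback must accept parameters with exactly those names. A minimal sketch of that pattern (the names page_count and the limit of 3 pages are just illustrative, taken from the attempt above):

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse, cb_kwargs={'page_count': 0})

def parse(self, response, page_count):
    # the parameter name must match the key used in cb_kwargs
    if page_count > 3:
        return
    # ... extract items here as before ...
    next_page = response.xpath('.//li[@class="next"]/a/@href').get()
    if next_page is not None:
        yield response.follow(
            next_page,
            callback=self.parse,
            cb_kwargs={'page_count': page_count + 1})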

What is this Scrapy error: ReactorNotRestartable?

I do not understand why my spider won't run. I tested the CSS selector separately, so I do not think it is the parsing method.
Traceback message:
ReactorNotRestartable:
import scrapy
from scrapy.crawler import CrawlerProcess


class espn_spider(scrapy.Spider):
    name = "fsu2021_spider"

    def start_requests(self):
        urls = "https://www.espn.com/college-football/team/_/id/52"
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_front)

    def parse(self, response):
        schedule_link = response.css('div.global-nav-container li > a::attr(href)')


process = CrawlerProcess()
process.crawl(espn_spider)
process.start()
urls = "https://www.espn.com/college-football/team/_/id/52"
for url in urls:

You're going through the characters of "urls"; change it to a list:

urls = ["https://www.espn.com/college-football/team/_/id/52"]
...
Also, you don't have a "parse_front" function. If you just didn't add it to the snippet then ignore this; if it was a mistake, change the callback to:
yield scrapy.Request(url=url, callback=self.parse)
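As for the ReactorNotRestartable error in the title: it is typically raised when process.start() runs more than once in the same Python process (for example, re-running the crawl from an interactive session or notebook), because Twisted's reactor cannot be restarted once it has stopped. A minimal sketch of the usual way around it, assuming the spider above (another_spider is a hypothetical second spider, only there to show the pattern):

from scrapy.crawler import CrawlerProcess

# Start the reactor exactly once per process, e.g. run the script fresh
# from the command line each time.
process = CrawlerProcess()
process.crawl(espn_spider)
# If you need several crawls, schedule them all before the single start():
# process.crawl(another_spider)
process.start()  # blocks until crawling is finished; cannot be called again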

Why only one result in loop scrapy

I'm trying to use Scrapy to crawl a page with a lot of links inside, but my existing code so far only shows the contents of the first link.
What mistake have I made?
from scrapy.spiders import BaseSpider
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.selector import Selector
from Proje.items import ProjeItem


class ProjeSpider(BaseSpider):
    name = "someweb"
    allowed_domains = ["someweb.com"]
    start_urls = [
        "http://someweb.com/indeks/"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul[@id="indeks-container"]'):
            for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
                links = 'http:' + str(tete)
                req = Request(links, callback=self.kontene)
                return req

    def kontene(self, response):
        for mbuh in response.xpath('//head'):
            Item = ProjeItem()
            Item['title'] = mbuh.xpath('//title/text()').extract()
            yield Item
According to the Scrapy docs, parse needs to return an iterable of Request, i.e. a list or a generator. Just change return to yield and it should work as expected:
def parse(self, response):
    for sel in response.xpath('//ul[@id="indeks-container"]'):
        for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
            links = 'http:' + str(tete)
            req = Request(links, callback=self.kontene)
            yield req
The issue is that you have a return statement within your for loop. In Python, a return exits the function, giving you only the first link's worth of content. Instead, consider appending req to a list and returning the list:
def parse(self, response):
    req_list = []
    for sel in response.xpath('//ul[@id="indeks-container"]'):
        for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
            links = 'http:' + str(tete)
            req = Request(links, callback=self.kontene)
            req_list.append(req)
    return req_list

Scrapy + Splash + ScrapyJS

I am using Splash 2.0.2 + Scrapy 1.0.5 + Scrapyjs 0.1.1 and I'm still not able to render JavaScript triggered by a click. Here is an example URL: https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        script = """
        function main(splash)
            splash:go(splash.args.url)
            splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
            splash:wait(0.5)
            return splash:html()
        end
        """
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents, meta={
                'splash': {
                    'args': {'lua_source': script},
                    'endpoint': 'execute',
                }
            })
        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        import ipdb; ipdb.set_trace()
How can I get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to the Lua script and it will work:
function main(splash)
    splash:go(splash.args.url)
    splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
    splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
    splash:wait(0.5)
    return splash:html()
end
.click() is a jQuery function: https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re

import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents)
        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        property_id = re.search(r"ID(\w+)\.", response.url).group(1)
        phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
        yield scrapy.Request(phone_url, callback=self.parse_phone)

    def parse_phone(self, response):
        phone_number = json.loads(response.body)["value"]
        print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and Selenium.

Scrapy: how to crawl the URL I got from spider? exceptions.NameError: global name 'parse_detail' is not defined

I'm practicing Scrapy and have a question: I want to crawl the links I got from the spider again, and I don't know how to do it.
Here is my code. As you can see, the links I crawl are saved in the variable movie_descriptionTW_URL, and I wrote yield Request(movie_descriptionTW, parse_detail) to send the result to:
def parse_detail(self, response):
    print(response.url)
But there is an error: exceptions.NameError: global name 'parse_detail' is not defined
How can I solve this? Please teach me, thank you!
from scrapy.spider import Spider
from scrapy.selector import Selector
from yahoo.items import YahooItem
from scrapy.http.request import Request


class MySpider(Spider):
    name = "yahoogo"
    start_urls = ["https://tw.movies.yahoo.com/chart.html"]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath("//tr")
        items = []
        for site in sites:
            item = YahooItem()
            ranking_list = site.xpath("td[@class='c1']/span/text()").extract()
            movie_descriptionTW = site.xpath("(td[@class='c3']/*//a)[position() < last()-1]/text() | td[@class='c3']/a[1]/text()").extract()
            movie_descriptionTW_URL = site.xpath("(td[@class='c3']/*//a[2]/@href) | td[@class='c3']/a[1]/@href").extract()

            # crawl again!
            yield Request(movie_descriptionTW, parse_detail)

            if ranking_list:
                items.append(item)
        yield items

    def parse_detail(self, response):
        print(response.url)
Use self.parse_detail to refer to class methods, like the following:

for url in movie_descriptionTW_URL:
    yield Request(url=url, callback=self.parse_detail)
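Putting that together, a minimal sketch of the corrected parse method, assuming the goal here is only to follow each detail URL (the XPath expressions are taken from the original code, and the item-building part is left out):

def parse(self, response):
    for site in response.xpath("//tr"):
        movie_descriptionTW_URL = site.xpath(
            "(td[@class='c3']/*//a[2]/@href) | td[@class='c3']/a[1]/@href").extract()
        # follow every detail link, using the bound method as the callback
        for url in movie_descriptionTW_URL:
            yield Request(url=url, callback=self.parse_detail)

def parse_detail(self, response):
    print(response.url)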
