Using Urllib with Scrapy for Pagination - python

I am trying to scrape the next page with Scrapy on Python 3.5, using the urllib standard library.
import datetime
import urllib.request
import urllib.error
import urllib.parse
import socket
import scrapy

from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader

from properties.items import PropertiesItem


class BasicSpider(scrapy.Spider):
    name = "manual"
    allowed_domains = ["web"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    def parse(self, response):
        # Get the next index URLs and yield Requests
        next_selector = response.xpath('//*[contains(@class,"next")]//@href')
        for url in next_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url))

        # Get item URLs and yield Requests
        item_selector = response.xpath('//*[@itemprop="url"]/@href')
        for url in item_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url),
                          callback=self.parse_item)

    def parse(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        return l.load_item()
Everything runs without errors, but Scrapy fetches only the first page, even though the code should follow all the next pages.
Here is the output:
[{
    "title": [
        "bermondsey ec kennington drive acton seven rm",
        .......
        "mary conversion borders eastham with gas"
    ]
}]
// Only page 0 titles :(
Is there anything wrong with the Request or urllib call syntax?
PS: The XPaths work fine when tested with scrapy shell 'URL'.

Let's start with the incorrect use of the Python packages.
You use Request without importing it. Fix that with:
from scrapy import Request
urljoin from urllib is also used clumsily. First import it:
from urllib.parse import urljoin
and then call urljoin directly instead of urllib.parse.urljoin, so the two yield lines become:
yield Request(urljoin(response.url, url))
yield Request(urljoin(response.url, url), callback=self.parse_item)
Finally, parse_item is never defined, so the callback never runs. Rename the second parse method:
def parse_item(self, response):  # rename the second parse to parse_item
PS: If this code is from the Learning Scrapy book, here is a complete example ported to Python 3:
https://github.com/Rahulsharma0810/Scrapy-Pagination-URLJOIN-Example

You seem to have two parse functions, so effectively you only have the second one, since it overrides the first.
Just rename the second one to parse_item, as the rest of your code seems to indicate.
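Putting both answers together, the spider might look like the sketch below. It is a minimal sketch assuming the same PropertiesItem, selectors, and start URL as in the question:

from urllib.parse import urljoin

import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader

from properties.items import PropertiesItem


class BasicSpider(scrapy.Spider):
    name = "manual"
    allowed_domains = ["web"]
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    def parse(self, response):
        # Follow pagination links back into parse.
        for url in response.xpath('//*[contains(@class,"next")]//@href').extract():
            yield Request(urljoin(response.url, url))

        # Follow item links and hand them to parse_item.
        for url in response.xpath('//*[@itemprop="url"]/@href').extract():
            yield Request(urljoin(response.url, url), callback=self.parse_item)

    def parse_item(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        return l.load_item()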

Related

Python web crawling using Scrapy unable to fill form

I am trying to crawl this site, which requires filling in a form with a postal code to reach the store info.
The website: https://www.aldi-sued.de/de/filialen.html
I have written the following code but don't know what's wrong. Please help:
from __future__ import unicode_literals

import logging

import scrapy
from scrapy.loader import ItemLoader

from ..items import StoreItem

logger = logging.getLogger(__name__)


class StoreSpider(scrapy.Spider):
    name = "aldib"
    start_urls = ["https://www.aldi-sued.de/de/filialen.html"]

    def parse(self, response):
        yield scrapy.FormRequest(url="https://www.aldi-sued.de/de/filialen.html",
                                 formdata={"search": "38644"},
                                 callback=self.parse_stores)

    def parse_stores(self, response):
        for store in response.css('div.dealer-list > div.dealer-item-content'):
            name = store.xpath("span.dealer-name > strong::text").extract()
            sl = ItemLoader(item=StoreItem(), selector=store, response=response)
            sl.add_value("Name", name)
            yield sl.load_item()
I suspect there are two forms on the website, one for site search and one for store search, and that my first code cannot choose the right one. So I changed the request part to:
def parse(self, response):
    yield scrapy.FormRequest.from_response(response,
                                           formid="storeSearchForm",
                                           formdata={"search": "38644"},
                                           callback=self.parse_stores)
In the end I still cannot reach the names of the stores for that postal code.
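One way to narrow this down is to check, in scrapy shell, which forms the page actually exposes and whether the dealer list is present in the raw HTML at all; if it is not, the stores are loaded by JavaScript/XHR and a FormRequest alone cannot reach them. A minimal diagnostic sketch, using only selectors already mentioned in the question:

# Run inside: scrapy shell "https://www.aldi-sued.de/de/filialen.html"
# List each form's id, action, and input names to find the store-search form.
for form in response.xpath('//form'):
    print(form.xpath('@id').extract_first(),
          form.xpath('@action').extract_first(),
          form.xpath('.//input/@name').extract())

# If the dealer list is absent from the raw HTML, the store data is most
# likely rendered client-side and FormRequest will not see it.
print(bool(response.css('div.dealer-list')))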

Scrapy extracting from javascript element

I am trying to extract data from the following URL using Scrapy.
import re
import json
import scrapy
import unicodedata
from collections import Counter

# command: scrapy crawl <spidername> -o <outputfile>


class PostsSpider(scrapy.Spider):
    name = "ljcmnt"
    start_urls = ['https://asperger.livejournal.com/3084959.html']

    def parse(self, response):
        comments_ids = response.xpath("//*[@id='comments']/div[5]/div/@data-tid").extract()
        print comments_ids
But since the element (data-tid) is generated by JavaScript, I am not able to catch it.
Any help in getting the data-tid values from the start URL would be appreciated.
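Scrapy only sees the raw HTML returned by the server, so attributes that JavaScript adds afterwards never show up in response.xpath. One common workaround is to render the page in a real browser first; below is a minimal sketch using Selenium, assuming the XPath from the question matches the rendered DOM:

from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://asperger.livejournal.com/3084959.html')

# Read the data-tid attributes from the rendered (post-JavaScript) DOM.
comment_divs = driver.find_elements_by_xpath("//*[@id='comments']/div[5]/div")
comment_ids = [div.get_attribute('data-tid') for div in comment_divs]
print(comment_ids)

driver.quit()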

How to crawl items from a list of links with Scrapy and Selenium

Can you please help me correct this script? I have a list of search-result links and I want to visit and crawl each one of them.
But this script clicks only the first link and then my crawler stops.
Any help is appreciated.
Spider code:
from scrapy.contrib.spiders import CrawlSpider
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.support.select import Select
from time import sleep
import selenium.webdriver.support.ui as ui
from scrapy.xlib.pydispatch import dispatcher
from scrapy.http import HtmlResponse, TextResponse
from extraction.items import ProduitItem
from scrapy import log


class RunnerSpider(CrawlSpider):
    name = 'products_d'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        sel = Selector(response)
        self.driver.get(response.url)
        recherche = self.driver.find_element_by_xpath('//*[@id="twotabsearchtextbox"]')
        recherche.send_keys("A")
        recherche.submit()

        resultat = self.driver.find_element_by_xpath('//ul[@id="s-results-list-atf"]')
        # Links
        resultas = resultat.find_elements_by_xpath('//li/div[@class="s-item-container"]/div/div/div[2]/div[1]/a')
        links = []
        for lien in resultas:
            l = lien.get_attribute('href')
            links.append(l)

        for result in links:
            item = ProduitItem()
            link = result
            self.driver.get(link)
            item['URL'] = link
            item['Title'] = self.driver.find_element_by_xpath('//h1[@id="aiv-content-title"]').text
            yield item

        self.driver.close()
So there are a few issues with your script.
1) Your parse function overrides CrawlSpider's implementation of the same function. That means that CrawlSpider's default behaviour, which is in charge of extracting links from the page for continued crawling, is not being called. That's not recommended when using CrawlSpider. See here for details:
http://doc.scrapy.org/en/latest/topics/spiders.html
2) You don't yield any followup URLs yourself. You only yield Items. If you want Scrapy to keep processing URLs, you have to yield some form of Request object alongside your items (see the sketch after this list).
3) You kill Selenium's driver at the end of the parse function. That will probably cause it to fail on a followup call anyway. There's no need to do that.
4) You're using Selenium & Scrapy's URL grabbing concurrently. That's not necessarily wrong, but keep in mind that it might result in some erratic behaviour.
5) Your script's indentation is definitely off, which makes your code difficult to read.
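As a minimal sketch of points 1 and 2, the Selenium-free version below uses a plain Spider, yields one Request per result link, and extracts the title in a callback. The start URL is a hypothetical search-results page, and the selectors are copied from the question rather than verified against Amazon's current markup:

import scrapy
from scrapy import Request

from extraction.items import ProduitItem


class RunnerSpider(scrapy.Spider):
    name = 'products_d'
    allowed_domains = ['amazon.com']
    # Hypothetical entry point: a search-results page rather than the home page.
    start_urls = ['http://www.amazon.com/s?field-keywords=A']

    def parse(self, response):
        # Yield one follow-up Request per result link (point 2 above).
        links = response.xpath(
            '//li/div[@class="s-item-container"]/div/div/div[2]/div[1]/a/@href').extract()
        for link in links:
            yield Request(response.urljoin(link), callback=self.parse_item)

    def parse_item(self, response):
        item = ProduitItem()
        item['URL'] = response.url
        item['Title'] = response.xpath('//h1[@id="aiv-content-title"]/text()').extract_first()
        yield item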

Python, Scrapy, Selenium: how to attach webdriver to "response" passed into a function to use it for further action

I am trying to use Selenium to obtain the value of the selected option from a drop-down list in a Scrapy spider, but am unsure how to go about it. It's my first interaction with Selenium.
As you can see in the code below, I create a request in the parse function, which calls the parse_page function as a callback. In parse_page I want to extract the value of the selected option. I can't figure out how to attach the webdriver to the response passed into parse_page so that I can use it with Select. I have written some obviously wrong code below :(
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.exceptions import CloseSpider
import logging
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy.http import FormRequest
from scrapy.http import Request
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from activityadvisor.items import TruYog

logging.basicConfig()
logger = logging.getLogger()


class TrueYoga(Spider):
    name = "trueyoga"
    allowed_domains = ["trueyoga.com.sg", "trueclassbooking.com.sg"]
    start_urls = [
        "http://trueclassbooking.com.sg/frames/class-schedules.aspx",
    ]

    def parse(self, response):
        clubs = []
        clubs = Selector(response).xpath('//div[@class="club-selections"]/div/div/div/a/@rel').extract()
        clubs.sort()
        print 'length of clubs = ', len(clubs), '1st content of clubs = ', clubs
        req = []
        for club in clubs:
            payload = {'ctl00$cphContents$ddlClub': club}
            req.append(FormRequest.from_response(response, formdata=payload,
                                                 dont_click=True, callback=self.parse_page))
        for request in req:
            yield request

    def parse_page(self, response):
        driver = webdriver.Firefox()
        driver.get(response)
        clubSelect = Select(driver.find_element_by_id("ctl00_cphContents_ddlClub"))
        option = clubSelect.first_selected_option
        print option.text
Is there any way to obtain this option value in Scrapy without using Selenium? My searches on Google and Stack Overflow haven't yielded any useful answers so far.
Thanks for the help!
I would recommend using Downloader Middleware to pass the Selenium response over to your spider's parse method. Take a look at the example I wrote as an answer to another question.
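Such a middleware typically looks something like the sketch below. The class name is hypothetical, it would still need to be enabled in the project's DOWNLOADER_MIDDLEWARES setting, and as written it only reproduces GET requests, not the FormRequest POST from the question:

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware(object):
    # Hypothetical downloader middleware: renders each page in a real browser
    # and hands the rendered HTML to the spider as an HtmlResponse.
    def __init__(self):
        self.driver = webdriver.Firefox()

    def process_request(self, request, spider):
        # Note: driver.get() issues a plain GET, so form POSTs are not
        # reproduced here; this is a simplification.
        self.driver.get(request.url)
        return HtmlResponse(self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)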
If you look at the response, the select boxes with their options are already in it, and one of those options has the attribute selected="selected". I think you should go through this attribute to avoid using Selenium:
def parse_page(self, response):
    response.xpath("//select[@id='ctl00_cphContents_ddlClub']//option[@selected = 'selected']").extract()

Distinguishing between HTML and non-HTML pages in Scrapy

I am building a Spider in Scrapy that follows all the links it can find and sends the URL to a pipeline. At the moment, this is my code:
from scrapy import Spider
from scrapy.http import Request
from scrapy.http import TextResponse
from scrapy.selector import Selector
from scrapyTest.items import TestItem
import urlparse


class TestSpider(Spider):
    name = 'TestSpider'
    allowed_domains = ['pyzaist.com']
    start_urls = ['http://pyzaist.com/drone']

    def parse(self, response):
        item = TestItem()
        item["url"] = response.url
        yield item
        links = response.xpath("//a/@href").extract()
        for link in links:
            yield Request(urlparse.urljoin(response.url, link))
This does the job, but throws an error whenever the response is just a Response, not a TextResponse or HtmlResponse. This is because there is no Response.xpath(). I tried to test for this by doing:
if type(response) is TextResponse:
    links = response.xpath("//a/@href").extract()
    ...
But to no avail. When I do that, it never enters the if statement. I am new to Python, so it might be a language thing. I appreciate any help.
Never mind, I found the answer. type() only reports the immediate type and tells you nothing about inheritance. I was looking for isinstance(). This code works:
if isinstance(response, TextResponse):
    links = response.xpath("//a/@href").extract()
    ...
https://stackoverflow.com/a/2225066/1455074, near the bottom
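For context, the check works because HtmlResponse and XmlResponse both subclass TextResponse, so isinstance() catches every text-like response while plain binary Responses are skipped. A minimal sketch of the check inside the parse method from the question (urlparse matches the Python 2 imports above):

def parse(self, response):
    item = TestItem()
    item["url"] = response.url
    yield item

    # Only text-like responses (HtmlResponse, XmlResponse, ...) support .xpath();
    # plain binary Responses (images, PDFs, ...) are skipped.
    if isinstance(response, TextResponse):
        for link in response.xpath("//a/@href").extract():
            yield Request(urlparse.urljoin(response.url, link))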
