Scraping/crawling multiple pages - python

So far I have found how to scrape one page, or multiple pages that share the same URL with only a number changing. However, I could not find how to scrape pages with subcategories, and their subcategories, and finally get the content needed.
I am trying to scrape this website: http://www.askislam.org/index.html
I am using Scrapy, but I do not know where to start.
Or you can suggest a better option; I just use Python and will take it from there.
Thanks
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy import Selector
from ask_islam.items import AskIslamItem
from scrapy.http import Request
import re


class AskislamSpider(Spider):
    name = "askislam"
    allowed_domains = ["askislam.org"]
    start_urls = ['http://www.askislam.org/']
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]

    def parse(self, response):
        hxs = Selector(response)
        links = hxs.css('div[id="categories"] li a::attr(href)').extract()
        for link in links:
            url = 'http://www.askislam.org' + link.replace('index.html', '')
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        hxs = Selector(response)
        categories = hxs.css('div[id="categories"] li').extract()
        questions = hxs.xpath('a').extract()
        if categories:
            for categoryLink in categories:
                url = 'http://www.askislam.org' + categoryLink.replace('index.html', '')
                yield Request(url, callback=self.parse_page)
                # print (question)
EDIT:
def start_requests(self):
    yield Request("http://www.askislam.org", callback=self.parse_page)

def parse_page(self, response):
    hxs = Selector(response)
    categories = hxs.css('#categories li')
    for cat in categories:
        item = AskIslamItem()
        link = cat.css('a::attr(href)').extract()[0]
        link = "http://www.askislam.org" + link
        item['catLink'] = link
        logging.info("Scraping Link: %s" % (link))
        yield Request(link, callback=self.parse_page)
        yield Request(link, callback=self.parse_categories)

def parse_categories(self, response):
    logging.info("The Cat Url")

Read the links of those sub-categories from the http://www.askislam.org/index.html page using XPath or CSS selectors, and then issue another Request() for each of them.
EDIT:
import logging

from scrapy import Request
from scrapy.spiders import Spider


class AskislamSpider(Spider):
    name = "askislam"

    def start_requests(self):
        yield Request("http://www.askislam.org/", callback=self.parse_page)

    def parse_page(self, response):
        # keep these as selectors (no .extract() here) so .css() can be chained on each one
        categories = response.css('#categories li')
        for cat in categories:
            link = cat.css("a::attr(href)").extract()[0]
            link = "http://www.askislam.org/" + link
            logging.info("Scraping Link: %s" % (link))
            yield Request(link, callback=self.parse_page)
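
If the crawl should also collect the actual content once it reaches a page with no further sub-categories, one possible extension of the same recursive pattern is sketched below. This is only a sketch: the leaf-page selector (#content a) and the title/url fields are hypothetical placeholders, not selectors verified against askislam.org.

import logging

from scrapy import Request
from scrapy.spiders import Spider


class AskislamSpider(Spider):
    name = "askislam"
    allowed_domains = ["askislam.org"]
    start_urls = ["http://www.askislam.org/"]

    def parse(self, response):
        # Follow every sub-category link and recurse with the same callback.
        category_links = response.css("#categories li a::attr(href)").extract()
        for link in category_links:
            logging.info("Scraping Link: %s" % response.urljoin(link))
            yield Request(response.urljoin(link), callback=self.parse)

        if not category_links:
            # No deeper sub-categories: treat this as a leaf page and yield its
            # content. The selector and field names below are placeholders.
            for question in response.css("#content a"):
                yield {
                    "title": question.css("::text").extract_first(),
                    "url": response.urljoin(question.css("::attr(href)").extract_first()),
                }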


Scrapy Pagination Fails on Multiple Listing

I'm trying to scrape a website using Scrapy.
When I scrape a specific page, the pagination scraping works, but when I try to scrape all the pages in one go, the pagination does not work.
I tried creating an extra function for the pagination, but this does not fix the problem. All help would be appreciated. What am I doing wrong? Here's my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.http import Request
from avtogumi.items import AvtogumiItem


class BasicSpider(scrapy.Spider):
    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/index.php']

    def parse(self, response):
        urls = response.xpath('//div[@class="brands"]//a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_params)

    def parse_params(self, response):
        l = ItemLoader(item=AvtogumiItem(), response=response)
        l.add_xpath('title', '//h4/a/text()')
        l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
        l.add_xpath('price', '//span[@class="promo-price"]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
        l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
        l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')
        yield l.load_item()

        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse_params)
The issue here is this:
l = ItemLoader(item=AvtogumiItem(), response=response)
l.add_xpath('title', '//h4/a/text()')
l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
l.add_xpath('price', '//span[@class="promo-price"]/text()',
            MapCompose(str.strip, str.title))
l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()',
            MapCompose(str.strip, str.title))
l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')
yield l.load_item()
This snippet of code will parse and load exactly one result. If you have a page with multiple results, you have to put this code inside a for loop, bind the loader to each individual result (via the loader's selector argument) and use relative XPaths, iterating over all the search results you want to parse:
objects = response.xpath('my_selector_here')
for object in objects:
    # bind the loader to this result only, and use relative ('.//') XPaths
    l = ItemLoader(item=AvtogumiItem(), selector=object)
    l.add_xpath('title', './/h4/a/text()')
    l.add_xpath('subtitle', './/p[@class="ft-darkgray"]/text()')
    l.add_xpath('price', './/span[@class="promo-price"]/text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('stock', './/div[@class="product-box-stock"]//span/text()')
    l.add_xpath('category', './/div[@class="labels hidden-md hidden-lg"][0]//text()')
    l.add_xpath('brand', './/h4[@class="brand-header"][0]//text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('img_path', './/div/img[@class="prod-imglist"]/@src')
    yield l.load_item()
Hope this helps.
Use/rewrite this code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class BasicSpider(scrapy.Spider):
    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/']

    def parse(self, response):
        urls = response.xpath('//div[@class="brands"]//a/@href').extract()
        for url in urls:
            yield Request(url=response.urljoin(url), callback=self.parse_params)

    def parse_params(self, response):
        subjects = response.xpath('//div[@class="full-product-box search-box"]')
        for subject in subjects:
            yield {
                'title': subject.xpath('.//h4/a/text()').extract_first(),
                'subtitle': subject.xpath('.//p[@class="ft-darkgray"]/text()').extract_first(),
                'price': subject.xpath('.//span[@class="promo-price"]/text()').extract_first(),
                'stock': subject.xpath('.//div[@class="product-box-stock"]//span/text()').extract_first(),
                'category': subject.xpath('.//div[@class="labels hidden-md hidden-lg"][0]//text()').extract_first(),
                'brand': subject.xpath('.//h4[@class="brand-header"][0]//text()').extract_first(),
                'img_path': subject.xpath('.//div/img[@class="prod-imglist"]/@src').extract_first(),
            }
        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            yield Request(url=next_page_url, callback=self.parse_params)
13407 items scraped

scrapy not following links

The following Scrapy code for returning medical treatment information does return the first set of results, but does not follow the links to further pages. I'm learning to code and have checked similar answers here on Stack Overflow, but integrating them did not work. True, I'm learning. Any pointers would be appreciated.
import urlparse

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import w3lib.url

from yelp.items import YelpItem


class YelpSpider(BaseSpider):
    name = "yelp"
    download_delay = 10
    concurrent_requests = 1
    concurrent_requests_per_domain = 1
    allowed_domains = ["yelp.com"]
    start_urls = ["http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=0",
                  "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=20",
                  "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=30"]

    def parse(self, response):
        selector = Selector(response)
        for title in selector.css("span.indexed-biz-name"):
            page_url = urlparse.urljoin(response.url,
                                        title.xpath("a/@href").extract()[0])
            self.log("page URL: %s" % page_url)
            #continue
            yield Request(page_url,
                          callback=self.parse_page)
        for next_page in selector.css(u'ul > li > a.prev-next:contains(\u2192)'):
            next_url = urlparse.urljoin(response.url,
                                        next_page.xpath('@href').extract()[0])
            self.log("next URL: %s" % next_url)
            #continue
            yield Request(next_url,
                          callback=self.parse)

    def parse_page(self, response):
        selector = Selector(response)
        item = YelpItem()
        item["name"] = selector.xpath('.//h1[@itemprop="name"]/text()').extract()[0].strip()
        item["addresslocality"] = u"\n".join(
            selector.xpath('.//address[@itemprop="address"]//text()').extract()).strip()
        item["link"] = response.url
        website = selector.css('div.biz-website a')
        if website:
            website_url = website.xpath('@href').extract()[0]
            item["website"] = w3lib.url.url_query_parameter(website_url, "url")
        return item
Your next-URL extraction and selection logic is not correct. Target the link element having both the next and pagination-links_anchor classes. The following works for me:
next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first()
if next_url:
    next_url = urlparse.urljoin(response.url, next_url)
    self.log("next URL: %s" % next_url)
    yield Request(next_url, callback=self.parse)

Scrapy and Splash don't crawl

I made a crawler; Splash is working (I tested it in my browser), but Scrapy can't crawl and extract items.
My actual code is:
# -*- coding: utf-8 -*-
import scrapy
import json
from scrapy.http.headers import Headers
from scrapy.spiders import CrawlSpider, Rule
from oddsportal.items import OddsportalItem


class OddbotSpider(CrawlSpider):
    name = "oddbot"
    allowed_domains = ["oddsportal.com"]
    start_urls = (
        'http://www.oddsportal.com/matches/tennis/',
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 5.5}
                }
            })

    def parse(self, response):
        item = OddsportalItem()
        print response.body
Try importing scrapy_splash and issuing the new request through SplashRequest:
from scrapy_splash import SplashRequest
yield SplashRequest(url, endpoint='render.html', args={'any':any})
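
For context, here is a minimal sketch of how the spider above could be rewritten around SplashRequest. It assumes the scrapy-splash downloader middlewares and SPLASH_URL are already configured in settings.py, keeps the original wait value, and uses a plain scrapy.Spider to sidestep the CrawlSpider issue discussed next:

# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


class OddbotSpider(scrapy.Spider):
    name = "oddbot"
    allowed_domains = ["oddsportal.com"]
    start_urls = ['http://www.oddsportal.com/matches/tennis/']

    def start_requests(self):
        for url in self.start_urls:
            # Render the page in Splash before handing the response to parse().
            yield SplashRequest(url, self.parse,
                                endpoint='render.html',
                                args={'wait': 5.5})

    def parse(self, response):
        # response.body now holds the JavaScript-rendered HTML.
        self.logger.info("Rendered %d bytes from %s", len(response.body), response.url)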
If you stick with CrawlSpider, you should also modify its _requests_to_follow method so that the isinstance check accepts Splash responses:
from scrapy.http import HtmlResponse
from scrapy_splash import SplashJsonResponse, SplashTextResponse

def _requests_to_follow(self, response):
    # same as CrawlSpider._requests_to_follow, but also accepts Splash responses
    if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            yield rule.process_request(r)

Scrapy: how to crawl the URL I got from spider? exceptions.NameError: global name 'parse_detail' is not defined

I'm practicing Scrapy and have a question:
I want to crawl the link I got from the spider again, but I don't know how to do it.
Here is my code.
As you can see, the link I crawl is saved in the parameter movie_descriptionTW_URL,
and I wrote yield Request(movie_descriptionTW, parse_detail) to send the result to this def:
def parse_detail(self, response):
    print(response.url)
But there is an error: exceptions.NameError: global name 'parse_detail' is not defined
How can I solve this?
Please teach me! Thank you.
from scrapy.spider import Spider
from scrapy.selector import Selector
from yahoo.items import YahooItem
from scrapy.http.request import Request


class MySpider(Spider):
    name = "yahoogo"
    start_urls = ["https://tw.movies.yahoo.com/chart.html"]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath("//tr")
        items = []
        for site in sites:
            item = YahooItem()
            ranking_list = site.xpath("td[@class='c1']/span/text()").extract()
            movie_descriptionTW = site.xpath("(td[@class='c3']/*//a)[position() < last()-1]/text() | td[@class='c3']/a[1]/text() ").extract()
            movie_descriptionTW_URL = site.xpath("(td[@class='c3']/*//a[2]/@href) | td[@class='c3']/a[1]/@href ").extract()
            # crawl again!
            yield Request(movie_descriptionTW, parse_detail)
            if ranking_list:
                items.append(item)
        yield items

    def parse_detail(self, response):
        print(response.url)
Use self.parse_detail to refer to class methods, like the following:
for url in movie_descriptionTW_URL:
    yield Request(url=url, callback=self.parse_detail)

Why scrapy is not going to next page and only getting first page items?

Earlier I also had one rule, i.e.:
if domains in departments.keys(): rules = (Rule(SgmlLinkExtractor(allow=("?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=d_d" % (keyword, departments.get(domains)),), restrict_xpaths=('//li[@class="btn-nextResults"]'),), callback='parse', follow=True),)
but I removed it, as it was calling the parse method, which is not recommended.
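
For reference, the usual way to keep such a rule is to point its callback at a method that is not named parse, since CrawlSpider uses parse internally for link following. A minimal, hypothetical sketch (parse_listing here stands for a renamed copy of the parse method in the spider below):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class MySpider(CrawlSpider):
    name = "my_spider"
    allowed_domains = ['walmart.com']
    # CrawlSpider reserves parse() for its own rule processing,
    # so the rule's callback must use a different name.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//li[@class="btn-nextResults"]',)),
             callback='parse_listing', follow=True),
    )

    def parse_listing(self, response):
        # a renamed copy of the original parse() body would go here
        pass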
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem


class MySpider(CrawlSpider):
    name = "my_spider"
    domains = ['All Departments']
    keyword = 'Laptop'
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666", "Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760", "Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104", "Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426", "Sports": "4125", "Toys": "4171", "Video Games": "2636"}
    allowed_domains = ['walmart.com']
    denied_domains = ['reviews.walmart.com', 'facebook.com', 'twitter.com']

    def start_requests(self):
        for domain in self.domains:
            if domain in self.departments:
                url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domain))
                yield Request(url)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
        for link in links:
            href = link.extract()
            yield Request('http://www.walmart.com/' + href, self.parse_data)
        next_link = hxs.select('//li[@class="btn-nextResults"]/@href').extract()
        if next_link:
            yield Request('http://www.walmart.com/search/search-ng.do' + next_link, self.parse)
        else:
            print "last Page"

    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        walmart = WalmartSampleItem()
        walmart['Title'] = hxs.select('//h1[@class="productTitle"]/text()').extract()
        walmart['Price'] = hxs.select('//span[@class="bigPriceText1"]/text()').extract() + hxs.select('//span[@class="smallPriceText1"]/text()').extract()
        walmart['Availability'] = hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
        walmart['Description'] = hxs.select('//span[@class="ql-details-short-desc"]/text()').extract()
        items.append(walmart)
        return items
I think you're simply missing an "/a" step in your XPath for next page links:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
    for link in links:
        href = link.extract()
        yield Request('http://www.walmart.com/' + href, self.parse_data)
    #
    # here
    # |
    # v
    next_link = hxs.select('//li[@class="btn-nextResults"]/a/@href').extract()
    if next_link:
        # and as hxs.select() will return a list, you should select the first element
        yield Request('http://www.walmart.com/search/search-ng.do' + next_link[0], self.parse)
    else:
        print "last Page"
