I'm trying to scrape course names together with the number of enrolled students from Udacity, to find out which courses are the most popular. I managed to create the code for the item:
import scrapy
class UdacityItem(scrapy.Item):
    name = scrapy.Field()
    users = scrapy.Field()
and spider:
import scrapy
from Udacity.items import UdacityItem
import re

class DmozSpider(scrapy.Spider):
    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        sites = response.xpath('//h3/a')
        for s in sites:
            t = UdacityItem()
            # name & url
            t['name'] = s.xpath('text()').extract()[0].strip()
            url = response.urljoin(s.xpath('@href').extract()[0])
            # request
            req = scrapy.Request(url, callback=self.second)
            req.meta['item'] = t
            # execute
            yield req

    def second(self, response):
        t = response.meta['item']
        strong = response.xpath('//strong[@data-course-student-count]/text()').extract()[0]
        t['users'] = strong
        yield t
As a result I'm getting the name of the course, but instead of the number of students I get the text 'thousands of'. When I open an example page in the browser, I see that 'thousands of' is the initial value, and after a second or two this text changes into the proper number (which is what I want to get).
And here are my questions:
Why is this replacement happening? Is it done by JavaScript? I would like to understand the mechanism behind this change.
How can I capture the proper number of students using Scrapy? I hope this is possible.
Thank you in advance for your help.
To get the enrollment count, you would have to simulate the API request to the https://www.udacity.com/api/summaries endpoint for a specific course id, which can be extracted from the URL itself. For example, it is ud898 for the https://www.udacity.com/course/javascript-promises--ud898 URL.
Complete spider:
import json
import re
from urllib import quote_plus

import scrapy

class UdacityItem(scrapy.Item):
    name = scrapy.Field()
    users = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        sites = response.xpath('//h3/a')
        for s in sites:
            t = UdacityItem()
            # name & url
            t['name'] = s.xpath('text()').extract()[0].strip()
            url = response.urljoin(s.xpath('@href').extract()[0])
            # request
            req = scrapy.Request(url, callback=self.second)
            req.meta['item'] = t
            # execute
            yield req

    def second(self, response):
        queries = [{
            "limit": 1,
            "model": "CourseStudentsSummary",
            "locator": {
                "sample_frequency": "daily",
                "content_context": [{
                    "node_key": re.search(r'--(.*?)$', response.url).group(1)
                }]
            }
        }]
        yield scrapy.Request(method="GET",
                             url="https://www.udacity.com/api/summaries?queries=" + quote_plus(json.dumps(queries)),
                             callback=self.parse_totals)

    def parse_totals(self, response):
        print(json.loads(response.body[5:].strip())["summaries"]["default"][0]["data"]["total_enrollments"])
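Note that second() above never forwards the item built in parse(), which is why parse_totals() only prints the count. Below is a minimal sketch, not part of the original answer, of how the two methods could carry the item through and yield it instead (it reuses the json, re and scrapy imports from the spider above; the from urllib import quote_plus line is Python 2, on Python 3 quote_plus lives in urllib.parse):

    # Sketch only: drop-in replacements for second() and parse_totals() on DmozSpider,
    # forwarding the UdacityItem via meta and yielding it with the count filled in.
    # (On Python 3 use: from urllib.parse import quote_plus)
    def second(self, response):
        item = response.meta['item']
        queries = [{
            "limit": 1,
            "model": "CourseStudentsSummary",
            "locator": {
                "sample_frequency": "daily",
                "content_context": [{
                    "node_key": re.search(r'--(.*?)$', response.url).group(1)
                }]
            }
        }]
        yield scrapy.Request(
            url="https://www.udacity.com/api/summaries?queries=" + quote_plus(json.dumps(queries)),
            callback=self.parse_totals,
            meta={'item': item})

    def parse_totals(self, response):
        item = response.meta['item']
        # as in the original answer, skip the short prefix before the JSON payload
        data = json.loads(response.text[5:].strip())
        item['users'] = data["summaries"]["default"][0]["data"]["total_enrollments"]
        yield item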
I am currently trying to crawl the Company Overview from alibaba.com.
For instance: https://www.alibaba.com/product-detail/T14-series-original-air-pro-TWS_1600273931389.html?spm=a2700.galleryofferlist.normal_offer.d_title.4aa778f2ahtuBx&s=p
To get information like the company name, I did:
response.xpath("//a[#class='company-name company-name-lite-vb']/text()").extract()
Which works fine.
When entering "Company Overview">"Company Profile" and than trying to crawl information from the table with:
response.xpath("//div/div[#class='content-value']").extract()
I get an empty array.
resources/search_results_searchpage.yml:
products:
    css: 'div[data-content="productItem"]'
    multiple: true
    type: Text
    children:
        link:
            css: a.elements-title-normal
            type: Link
crawler.py:
import scrapy
import csv
# from scrapy_selenium import SeleniumRequest  # only needed when using selenium
import os
from selectorlib import Extractor

class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage)
            # yield SeleniumRequest(url=parsed_url, callback=self.crawl_mainpage)

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }
Does anybody have an idea what I could do to populate Year of Est.?
I tried to use scrapy_selenium and configured it correctly, because I suspect that the element is generated dynamically, but still no luck, or I am possibly using it wrong.
Run with:
scrapy crawl alibaba_crawler -o out.csv -t csv
Your xpath selector is not correct. Try this
'Year of Est.': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
I also noticed some errors in your code, such as the line below, which will raise an error. You may want to recheck how you extract links from the search page.
data = self.link_extractor.extract(response.text, base_url=response.url)
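If you want to avoid the selectorlib dependency for this step, here is a rough sketch of the same link extraction done with plain Scrapy selectors, assuming the a.elements-title-normal selector from the YAML file still matches the search result links:

    # Sketch only: pull product links straight from the search page with CSS selectors.
    def parse(self, response):
        links = response.css('div[data-content="productItem"] a.elements-title-normal::attr(href)').getall()
        for href in links:
            yield scrapy.Request(response.urljoin(href), callback=self.crawl_mainpage)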
Edit:
The year of establishment is loaded once the company tab is clicked. You have to simulate the click using selenium or scrapy-playwright. My simple implementation using scrapy-playwright is as below.
import scrapy
from scrapy.crawler import CrawlerProcess
import os
from selectorlib import Extractor
from scrapy_playwright.page import PageCoroutine

class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(
            search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(
            response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage, meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "click": PageCoroutine("click", selector="//span[@title='Company Profile']"),
                },
            })

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    process.crawl(Spider)
    process.start()
Running the scraper with python crawler.py shows the year 2010 in the output.
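Note that more recent releases of scrapy-playwright renamed PageCoroutine to PageMethod and the meta key to playwright_page_methods. If you are on such a version, the request in parse() would look roughly like this (a sketch, not tested against a specific release):

    # Sketch only: the same click simulation with the newer scrapy-playwright API.
    from scrapy_playwright.page import PageMethod

    yield scrapy.Request(parsed_url, callback=self.crawl_mainpage, meta={
        "playwright": True,
        "playwright_page_methods": [
            PageMethod("click", selector="//span[@title='Company Profile']"),
        ],
    })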
Scrapy super noob here. Problem: I have an HTML page that contains both information that I want to scrape and a URL that I want to follow to get image URLs for images that I want to download and save via the Scrapy image pipeline.
My approach to achieve this:
1. Scrape all the details as usual with a parse method
2. Find the URL in the initial page and create a request that has a second parse method as callback, where I build the image_urls list.
So, I have the following setup:
settings.py
...
ITEM_PIPELINES = {
    'crawlbot.pipelines.MybotPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/url/to/images'  # valid path to actual folder
...
pipelines.py
import pymongo
class MybotPipeline(object):
    def __init__(self):
        self.conn = pymongo.MongoClient('localhost', 27017)
        db = self.conn['libraries']
        self.collection = db['books']

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        return item
items.py
import scrapy
class MybotItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    description = scrapy.Field()
crawler.py
import scrapy
from scrapy.spiders import CrawlSpider

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['books.com']

    def start_requests(self):
        urls = [
            'https://www.books.com/some/url'
        ]
        custom_settings = {
            'DEPTH_LIMIT': 1
        }
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_details(self, response):
        for image in enumerate(response.xpath('//div[contains(@class, "jumbotron")]/div')):
            image_urls = image.xpath('div[contains(@class, "jumbotron-image")]/img/@src').getall()

    def parse_item(self, response):
        for idx, list_item in enumerate(response.xpath('//div[contains(@class, "slider-wrapper")]')):
            anchor = list_item.xpath('div[contains(@class, "slider-section")]/div/a')
            slider_thumbnail = anchor.xpath('div[contains(@class, "slider-thumbnail")]')
            description = slider_thumbnail.xpath('div[contains(@class, "description-box")]')
            yield {
                'url': anchor.xpath('@href').get(),
                'description': description
            }
            details_page_urls = anchor.xpath('@href').getall()
            for details_page in details_page_urls:
                yield scrapy.Request(url=details_page, callback=self.parse_details)
This is not working, although with my limited knowledge of both Scrapy and Python, the second parse method should return a list of image_urls. So I have two questions: 1. Is there a better approach for my case? Maybe the whole issue is in trying to do too much with one spider? 2. If the approach is OK, what am I doing wrong?
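For reference, here is a minimal sketch of the flow described in the approach above: build the item in the first callback, pass it along in the request meta, and only yield it from the details callback once image_urls is filled in, since the ImagesPipeline acts on the image_urls field of items that actually get yielded. The selectors are taken from the question and are assumptions about the page structure:

# Sketch only: meta-passing flow for the image pipeline, using the question's selectors.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['books.com']
    start_urls = ['https://www.books.com/some/url']

    def parse(self, response):
        for list_item in response.xpath('//div[contains(@class, "slider-wrapper")]'):
            anchor = list_item.xpath('div[contains(@class, "slider-section")]/div/a')
            item = {
                'url': anchor.xpath('@href').get(),
                'description': anchor.xpath('.//div[contains(@class, "description-box")]//text()').get(),
            }
            details_url = response.urljoin(anchor.xpath('@href').get())
            # carry the partially built item to the details page
            yield scrapy.Request(details_url, callback=self.parse_details, meta={'item': item})

    def parse_details(self, response):
        item = response.meta['item']
        # the ImagesPipeline downloads whatever ends up in image_urls
        item['image_urls'] = response.xpath('//div[contains(@class, "jumbotron")]//img/@src').getall()
        yield item  # nothing reaches the pipelines unless the item is yielded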
I am crawling the names of the colleges on this webpage, but I also want to crawl the number of faculties at these colleges, which is available if you open a college's own page by clicking the name of the college.
What should I add to this code to get that result?
The result should be in the form [(name1, faculty1), (name2, faculty2), ...].
import scrapy
class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('li.search-result'):
            yield {
                'name': students.css('div.title a::text').extract(),
            }
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('li.search-result'):
            # SELECT_URL is a placeholder for the selector that extracts the college's link
            url = response.urljoin(students.css(SELECT_URL).extract_first())
            req = scrapy.Request(url, callback=self.parse_student)
            req.meta['name'] = students.css('div.title a::text').extract()
            yield req

    def parse_student(self, response):
        yield {
            'name': response.meta.get('name'),
            'other data': response.css(SELECTOR).extract()  # SELECTOR is a placeholder, too
        }
It should be something like this.
So you send the name of the college in the meta data of the request.
That allows you to access it in the next callback.
If the data is also available on the last page you scrape in parse_student, you might want to consider not sending it in the meta data but just scraping it from that page.
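On newer Scrapy versions (1.7+), cb_kwargs is a slightly cleaner way to pass the name along than meta. A sketch, with SELECT_URL and SELECTOR being the same placeholders as above:

    # Sketch only: the same idea using cb_kwargs (Scrapy 1.7+) instead of meta.
    def parse(self, response):
        for students in response.css('li.search-result'):
            url = response.urljoin(students.css(SELECT_URL).get())  # SELECT_URL: placeholder selector for the link
            name = students.css('div.title a::text').getall()
            yield scrapy.Request(url, callback=self.parse_student, cb_kwargs={'name': name})

    def parse_student(self, response, name):
        yield {
            'name': name,
            'other data': response.css(SELECTOR).getall(),  # SELECTOR: placeholder as above
        }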
I tried to parse product names and prices from a site using Scrapy. However, when I run my Scrapy code it neither shows any error nor fetches any data. What I'm doing wrong is beyond my ability to find out. I hope someone can take a look into it.
"items.py" includes:
import scrapy
class SephoraItem(scrapy.Item):
    Name = scrapy.Field()
    Price = scrapy.Field()
spider file named "sephorasp.py" contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SephoraspSpider(CrawlSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = ["https://www.sephora.ae/en/stores/"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
             callback="parse_item")
    ]

    def parse_item(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Here is the Link to the Log:
"https://www.dropbox.com/s/8xktgh7lvj4uhbh/output.log?dl=0"
It works when I play around with BaseSpider:
from scrapy.spider import BaseSpider
from scrapy.http.request import Request

class SephoraspSpider(BaseSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = [
        "https://www.sephora.ae/en/travel-size/make-up",
        "https://www.sephora.ae/en/perfume/women-perfume",
        "https://www.sephora.ae/en/makeup/eye/eyeshadow",
        "https://www.sephora.ae/en/skincare/moisturizers",
        "https://www.sephora.ae/en/gifts/palettes"
    ]

    def pro(self, response):
        item_links = response.xpath('//a[contains(@class, "level0")]/@href').extract()
        for a in item_links:
            yield Request(a, callback=self.end)

    def end(self, response):
        item_link = response.xpath('//a[@class="level2"]/@href').extract()
        for b in item_link:
            yield Request(b, callback=self.parse)

    def parse(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Your XPaths are heavily flawed:
Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
You are matching the whole class string, which can change at any point, and the order of the classes might differ in Scrapy. Just pick one class; it's most likely unique enough:
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level0")]')),
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level2")]')),
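Putting it together, a sketch of the spider from the question with only the rules relaxed (and the modern scrapy.spiders import path instead of the deprecated scrapy.contrib.spiders):

# Sketch only: the original CrawlSpider with relaxed class matching in the rules.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SephoraspSpider(CrawlSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = ["https://www.sephora.ae/en/stores/"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level0")]')),
        Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level2")]'), callback="parse_item"),
    ]

    def parse_item(self, response):
        for product in response.xpath('//div[@class="product-info"]'):
            yield {
                'Name': product.xpath('.//a[@title]/text()').extract(),
                'Price': product.xpath('.//span[@class="price"]/text()').extract(),
            }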
I want to use Scrapy to crawl data from webpages, but the difference between different pages can't be seen from the URL. For example:
http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=man&submit=Feeling+Lucky
The URL above is the first page I want to crawl data from, and it's easy to get data from it.
Here is my code:
__author__ = 'Rabbit'

from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy_Data.items import EPGD

class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    stmp = []
    term = "man"
    url_base = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=man&submit=Feeling+Lucky"
    start_urls = stmp

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for site in sites:
            item = EPGD()
            item['genID'] = map(unicode.strip, site.xpath('td[1]/a/text()').extract())
            item['taxID'] = map(unicode.strip, site.xpath('td[2]/a/text()').extract())
            item['familyID'] = map(unicode.strip, site.xpath('td[3]/a/text()').extract())
            item['chromosome'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            item['symbol'] = map(unicode.strip, site.xpath('td[5]/text()').extract())
            item['description'] = map(unicode.strip, site.xpath('td[6]/text()').extract())
            yield item
But the problem appears when I want to get data from page 2. I click next page, and the URL of the second page looks like this:
http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=20
As you can see, it doesn't have a keyword in its URL, so I don't know how to get data from the other pages. Maybe I should use cookies, but I don't know how to handle this situation, so can anyone help me?
Thanks a lot!
When link parsing and Request yielding are added to your parse() function, your example just works for me. Maybe the page uses some server-side cookies. However, it fails when using a proxy service like Scrapy's Crawlera (which downloads from multiple IPs).
The solution is to enter the 'textquery' parameter manually into the request URL:
import urlparse
from urllib import urlencode

from scrapy import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector

class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = 'calb'
    base_url = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=0&textquery=%s"
    start_urls = [base_url % term]

    def update_url(self, url, params):
        url_parts = list(urlparse.urlparse(url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(params)
        url_parts[4] = urlencode(query)
        url = urlparse.urlunparse(url_parts)
        return url

    def parse(self, response):
        sel = Selector(response)
        genes = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for gene in genes:
            item = {}
            item['genID'] = map(unicode.strip, gene.xpath('td[1]/a/text()').extract())
            # ...
            yield item

        urls = sel.xpath('//div[@id="nviRecords"]/span[@id="quickPage"]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            url = self.update_url(url, params={'textquery': self.term})
            yield Request(url)
Details of the update_url() function are from Lukasz's solution:
Add params to given URL in Python
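The snippet above targets Python 2 (urlparse, urllib.urlencode, unicode). On Python 3 the same helper would look roughly like this:

# Sketch only: Python 3 equivalent of the update_url() helper above.
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

def update_url(url, params):
    url_parts = list(urlparse(url))
    query = dict(parse_qsl(url_parts[4]))
    query.update(params)
    url_parts[4] = urlencode(query)
    return urlunparse(url_parts)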