Struggling to make an XPath work - Python

I am starting at one page on a website and trying to find a link that takes me to another page to scrape. However, for reasons I can't work out, my XPath to the link on this first page is not working. I have tried to get to the link from several directions, but none seem to work.
import scrapy
from urllib.parse import urljoin
from autotrader1.items import Autotrader1Item

class Autotrader1Spider(scrapy.Spider):
    name = "autotrader1"
    allowed_domains = ["autotrader.co.uk"]
    start_urls = ["https://www.autotrader.co.uk/dealers/greater-manchester/manchester/liberty-cars-10000889"]

    def parse(self, response):
        for href1 in response.xpath('//*[@class="dealer__stock-header"]/a/@href'):
            url1 = urljoin('https://autotrader.co.uk/', href1.extract())
            yield scrapy.Request(url1, callback=self.parse_dir_contents1)

    def parse_dir_contents1(self, response):
        for sel in response.xpath('//div[@class="dealerStock"]'):
            URLs = sel.xpath('.//a[@class="external title"]/@href').extract()
            URLs = [URL.strip() for URL in URLs]
            descriptions = sel.xpath('.//a[@class="external title"]/text()').extract()
            descriptions = [description.strip() for description in descriptions]
            prices = sel.xpath('.//div/span[@class="deal-price"]/text()').extract()
            prices = [price.strip() for price in prices]
            result = zip(URLs, descriptions, prices)
            for URL, description, price in result:
                item = Autotrader1Item()
                item['URL'] = URL
                item['description'] = description
                item['price'] = price
                yield item
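One way to test the selector in isolation is Scrapy's interactive shell (a debugging sketch, not a confirmed fix; the XPath is the one from the question). If the list comes back empty, the link is probably rendered by JavaScript or the class name differs in the raw HTML:

scrapy shell "https://www.autotrader.co.uk/dealers/greater-manchester/manchester/liberty-cars-10000889"
>>> # Does the selector match anything in the downloaded (non-JS) HTML?
>>> response.xpath('//*[@class="dealer__stock-header"]/a/@href').extract()
>>> # If it is empty, check whether the class name appears in the raw response at all:
>>> 'dealer__stock-header' in response.text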

Related

Python + web scraping + scrapy : How to get the links to all movies from an IMDb page?

I have to scrape all the movies from this IMDb page: https://www.imdb.com/list/ls055386972/.
My approach was first to scrape the title values, but that returns strings like [u'The Godfather', u'Schindler\'s List', ...]. I want only the "/title/tt0068646/?ref_=ttls_li_tt" portion.
How do I proceed?
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.imdb.com/list/ls055386972/")
soup = BeautifulSoup(page.content, 'html.parser')
movies = soup.findAll('h3', attrs={'class': 'lister-item-header'})
for movie in movies:
    print(movie.a['href'])
OUTPUT:
/title/tt0068646/?ref_=ttls_li_tt
/title/tt0108052/?ref_=ttls_li_tt
/title/tt0050083/?ref_=ttls_li_tt
/title/tt0118799/?ref_=ttls_li_tt
...
/title/tt0088763/?ref_=ttls_li_tt
/title/tt0266543/?ref_=ttls_li_tt
I would suggest using requests-html to get all the hyperlinks and removing the ones that don't match your criteria. You can even get the absolute URLs using r.html.absolute_links.
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.imdb.com/list/ls055386972/')
# r.html.links is a set of URLs, so filter it with a comprehension
# rather than deleting entries by index
links = [link for link in r.html.links if link.startswith('/title/')]
print(links)
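As mentioned above, r.html.absolute_links skips the manual join step; the same filter against absolute URLs would look like this (a small untested sketch):

# absolute_links holds fully qualified URLs, so match on the path fragment
absolute = [link for link in r.html.absolute_links if '/title/' in link]
print(absolute)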
Here is working code; please try it:
import scrapy
from scrapy import Request

class MoviesSpider(scrapy.Spider):
    name = 'movies'  # name of the spider
    allowed_domains = ['imdb.com']
    start_url = 'http://imdb.com/list/ls055386972/'

    def __init__(self):
        super(MoviesSpider, self).__init__()

    def start_requests(self):
        yield Request(self.start_url, callback=self.parse)

    def parse(self, response):
        #events = response.xpath('//*[@property="url"]/@href').extract()
        links = response.xpath('//h3[@class]/a/@href').extract()
        final_links = []
        for link in links:
            final_link = 'http://www.imdb.com' + link
            final_links.append(final_link)
        for final_link in final_links:
            absolute_url = response.urljoin(final_link)
            yield Request(absolute_url, callback=self.parse_movies)
        #process next page url
        #next_page_url = response.xpath('//a[text() = "Next"]/@href').extract_first()
        #absolute_next_page_url = response.urljoin(next_page_url)
        #yield Request(absolute_next_page_url)

    def parse_movies(self, response):
        title = response.xpath('//div[@class = "title_wrapper"]/h1[@class]/text()').extract_first()
        yield {
            'title': title,
        }
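If the usual project scaffolding is in place, the spider runs with scrapy crawl movies. Alternatively, a standalone run is possible with CrawlerProcess (a sketch; the FEEDS setting assumes Scrapy 2.1 or later):

from scrapy.crawler import CrawlerProcess

# Run the spider above without a full Scrapy project, exporting items to JSON.
process = CrawlerProcess(settings={"FEEDS": {"movies.json": {"format": "json"}}})
process.crawl(MoviesSpider)
process.start()  # blocks until the crawl finishes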

Scrapy Spider following urls, but won't export the data

I am trying to grab details from a real-estate listing page. I can grab all the data; I just can't seem to export it.
Perhaps it's a problem with the way I use the yield keyword. The code works for the most part:
1. It visits page 1, example.com/kittens.
2. It goes to page 2, example.com/puppers, where 10 apartments are listed in blocks. I can get the data from each block, but I need additional info from inside each block's hyperlink.
3. It visits the hyperlink, say example.com/puppers/apartment1, and grabs some info from there as well, but I can't seem to return this data so it's included in my HousingItem() class.
import scrapy
from urllib.parse import urljoin

class HousingItem(scrapy.Item):
    street = scrapy.Field()
    postal = scrapy.Field()
    city = scrapy.Field()
    url = scrapy.Field()
    buildY = scrapy.Field()
    on_m = scrapy.Field()
    off_m = scrapy.Field()

class FAppSpider(scrapy.Spider):
    name = 'f_app'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/kittens']

    def parse(self, response):
        yield scrapy.Request(url="https://www.example.com/puppers",
                             callback=self.parse_puppers)

    def parse_inside_pupper(self, response):
        item = HousingItem()
        item['buildY'] = response.xpath('').extract_first().strip()
        item['on_m'] = response.xpath('').extract_first().strip()
        item['off_m'] = response.xpath('').extract_first().strip()

    def parse_puppers(self, response):
        base_url = 'https://www.example.com/'
        for block in response.css('div.search-result-main'):
            item = HousingItem()
            item['street'] = block.css('')
            item['postal'] = block.css('')
            item['city'] = block.css('')
            item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
            # Problem area from here..
            yield response.follow(url=item['url'], callback=self.parse_inside_pupper)
            # yield scrapy.Request(url=item['url'], callback=self.parse_inside_pupper)?
            yield item
FEED_EXPORT_FIELDS is adjusted in my settings.py. The 4 items from parse_puppers() get exported correctly; the parse_inside_pupper() data is correct in the console, but won't export.
I use scrapy crawl f_app -o raw_data.csv to run my spider. Thanks in advance, I appreciate all the help.
P.S. I'm fairly new to Python and practising, as I bet you noticed.
You need to send your current item to parse_inside_pupper using the meta param:
def parse_puppers(self, response):
    base_url = 'https://www.example.com/'
    for block in response.css('div.search-result-main'):
        item = HousingItem()
        item['street'] = block.css('')
        item['postal'] = block.css('')
        item['city'] = block.css('')
        item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
        yield response.follow(url=item['url'], callback=self.parse_inside_pupper, meta={"item": item})
After that you can use it inside parse_inside_pupper (and yield the item from there):
def parse_inside_pupper(self, response):
    item = response.meta["item"]
    item['buildY'] = response.xpath('').extract_first().strip()
    item['on_m'] = response.xpath('').extract_first().strip()
    item['off_m'] = response.xpath('').extract_first().strip()
    yield item
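On Scrapy 1.7 and later, cb_kwargs is the documented alternative to meta for handing data to a callback; a minimal sketch of the same chain (selectors elided exactly as in the question):

def parse_puppers(self, response):
    base_url = 'https://www.example.com/'
    for block in response.css('div.search-result-main'):
        item = HousingItem()
        # ... fill street/postal/city/url as above ...
        yield response.follow(url=item['url'],
                              callback=self.parse_inside_pupper,
                              cb_kwargs={"item": item})

def parse_inside_pupper(self, response, item):
    # the item arrives as a keyword argument instead of via response.meta
    item['buildY'] = response.xpath('').extract_first()
    yield item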

Scrapy: Extracting data from source and its links

Edited the question to link to the original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as from the pages behind the other two links inside the table. I managed to pull data from one, but my question is how to go to the other link as well and append the data onto one line.
from urlparse import urljoin

import scrapy
from scrapy.item import Item, Field

class DeathItem(Item):
    firstName = Field()
    lastName = Field()
    Age = Field()
    Date = Field()
    Race = Field()
    County = Field()
    Message = Field()
    Passage = Field()

class DeathSpider(scrapy.Spider):
    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()
            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()
            url = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            url2 = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            if url.endswith("html"):
                request = scrapy.Request(url, meta={"item": item, "url2": url2}, callback=self.parse_details)
                yield request
            else:
                yield item

    def parse_details(self, response):
        item = response.meta["item"]
        url2 = response.meta["url2"]
        item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
        request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
        return request

    def parse_details2(self, response):
        item = response.meta["item"]
        item['Passage'] = response.xpath("//p/text()").extract_first()
        return item
I understand how we pass arguments to a request with meta. But the flow is still unclear to me; at this point I am unsure whether this is possible or not. I have viewed several examples, including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
Technically the output should mirror the main table, just with both links' data appended to each row.
Appreciate any help or direction.
The problem in this case is in this piece of code:
if url.endswith("html"):
    yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
    yield item
if url2.endswith("html"):
    yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
    yield item
By yielding two separate requests you create two independent "threads", each taking its own course of life, so parse_details won't be able to see what is being done in parse_details2. The way I would do it is to call one within the other, like this:
url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
if url.endswith("html"):
    request = scrapy.Request(url, callback=self.parse_details)
    request.meta['item'] = item
    request.meta['url2'] = url2
    yield request
elif url2.endswith("html"):
    request = scrapy.Request(url2, callback=self.parse_details2)
    request.meta['item'] = item
    yield request
else:
    yield item

def parse_details(self, response):
    item = response.meta["item"]
    url2 = response.meta["url2"]
    item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
    if url2:
        request = scrapy.Request(url2, callback=self.parse_details2)
        request.meta['item'] = item
        yield request
    else:
        yield item
This code hasn't been tested thoroughly, so comment as you test.
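To make the chain explicit (a summary of the answer's code above, using its names; no new behaviour):

# parse()            builds the item; when url (the td[2] link) ends in "html":
#   -> parse_details()     fills item['About Me'], then follows url2 (the td[3] link)
#     -> parse_details2()  fills item['Passage'] and yields the finished item
# The item is yielded once, at the end of the chain, so both linked pages'
# data end up on the same output line.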

How to use Scrapy to crawl data from multiple pages implemented in JavaScript

I want to use Scrapy to crawl data from web pages, but the difference between pages can't be seen from the URL. For example:
http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=man&submit=Feeling+Lucky
The URL above is the first page I want to crawl data from, and it's easy to get the data from it.
Here is my code:
__author__ = 'Rabbit'
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy_Data.items import EPGD

class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    stmp = []
    term = "man"
    url_base = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=man&submit=Feeling+Lucky"
    start_urls = stmp

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for site in sites:
            item = EPGD()
            item['genID'] = map(unicode.strip, site.xpath('td[1]/a/text()').extract())
            item['taxID'] = map(unicode.strip, site.xpath('td[2]/a/text()').extract())
            item['familyID'] = map(unicode.strip, site.xpath('td[3]/a/text()').extract())
            item['chromosome'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            item['symbol'] = map(unicode.strip, site.xpath('td[5]/text()').extract())
            item['description'] = map(unicode.strip, site.xpath('td[6]/text()').extract())
            yield item
But the problem arises when I want to get data from page 2. When I click "next page", the URL of the second page looks like this:
http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=20
As you can see, it doesn't keep the keyword in its URL, so I don't know how to get the data from the other pages. Maybe I should use cookies, but I don't know how to handle this situation, so can anyone help me?
Thanks a lot!
When link parsing and Request yielding are added to your parse() function, your example just works for me. Maybe the page uses some server-side cookies; using a proxy service like Scrapy's Crawlera (which downloads from multiple IPs), it fails, though.
The solution is to put the 'textquery' parameter into the request URL manually:
import urlparse
from urllib import urlencode

from scrapy import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector

class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = 'calb'
    base_url = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=0&textquery=%s"
    start_urls = [base_url % term]

    def update_url(self, url, params):
        url_parts = list(urlparse.urlparse(url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(params)
        url_parts[4] = urlencode(query)
        url = urlparse.urlunparse(url_parts)
        return url

    def parse(self, response):
        sel = Selector(response)
        genes = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for gene in genes:
            item = {}
            item['genID'] = map(unicode.strip, gene.xpath('td[1]/a/text()').extract())
            # ...
            yield item

        urls = sel.xpath('//div[@id="nviRecords"]/span[@id="quickPage"]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            url = self.update_url(url, params={'textquery': self.term})
            yield Request(url)
The update_url() helper comes from Lukasz's solution here:
Add params to given URL in Python
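For illustration, this is roughly what update_url() produces (a hypothetical interactive session; the parameter order in the rebuilt query string may differ, since it comes from a dict):

>>> spider = EPGD_spider()
>>> spider.update_url('http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=20',
...                   params={'textquery': 'calb'})
'http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=20&textquery=calb'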

Scrape information from Scraped URL

I am new to Scrapy and am currently learning how to scrape information from a list of scraped URLs. I have been able to scrape information from one URL by going through the tutorial on the Scrapy website. However, I am facing a problem scraping information from a list of URLs scraped from that first URL, even after googling for a solution online.
The scraper I have written below is able to scrape from the first URL. However, it is unsuccessful in scraping from the list of scraped URLs. The problem starts at def parse_following_urls(self, response):, where I am unable to scrape from the list of scraped URLs.
Can anyone help solve this? Thanks in advance.
import scrapy
from scrapy.http import Request

class SET(scrapy.Item):
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()

class ThaiSpider(scrapy.Spider):
    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            yield item
        urll = response.xpath('//table[@class]/tbody/tr/td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
        urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
        for url in urls:
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)
            yield request
            request.meta['item'] = item

    def parse_following_urls(self, response):
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I have rewritten the code after trying the suggestions given and looking at the output. Below is the edited code. However, I now get another error stating Request url must be str or unicode, got list. How do I convert the URL from a list to a string?
I thought the URL would be a string since it is inside a for loop. I have added this as a comment in the code below. Is there any way to solve this?
import scrapy
from scrapy.http import Request

class SET(scrapy.Item):
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()

class ThaiSpider(scrapy.Spider):
    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            url = ["http://marketdata.set.or.th/mkt/"] + sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)  # Request url must be str or unicode, got list: how to solve this?
            request.meta['item'] = item
            yield item
            yield request

    def parse_following_urls(self, response):
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I see what you are trying to do here; it's called chaining requests.
What this means is that you keep yielding Requests while carrying your partially filled item along in the Request's meta attribute.
For your case, all you need to do is yield a Request with the item in it instead of yielding the item itself. Change your parse to:
def parse(self, response):
    for sel in response.xpath('//table[@class]/tbody/tr'):
        item = SET()
        item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
        item['open'] = sel.xpath('td[3]/text()').extract()
        item['hi'] = sel.xpath('td[4]/text()').extract()
        item['lo'] = sel.xpath('td[5]/text()').extract()
        item['last'] = sel.xpath('td[6]/text()').extract()
        item['bid'] = sel.xpath('td[9]/text()').extract()
        item['ask'] = sel.xpath('td[10]/text()').extract()
        item['vol'] = sel.xpath('td[11]/text()').extract()
    urll = response.xpath('//table[@class]/tbody/tr/td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
    urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
    for url in urls:
        yield scrapy.Request(url,
                             callback=self.parse_following_urls,
                             meta={'item': item})
I tried changing the 5th line from the end,
item = response.meta['item']
to
item = SET()
and then it works!
Actually I didn't really understand your "meta" way, since I have never used it to carry an item.
