Trying to scrape a table provides empty output - python

I am trying to scrape the table, but it gives me empty output. This is the page link: https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Question's spider: scrapes a key/value table from the page but
    yields empty output (see the working answer below)."""
    name = 'pushpa'
    page_number = 1  # unused
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        details = {}
        # NOTE(review): .get() matches only the FIRST matching row; also
        # '//table//tbody' fails when <tbody> is browser-inserted and
        # absent from the raw HTML — presumably why the output is empty.
        key = response.xpath("//table//tbody/tr/td[1]/text()").get()
        value = response.xpath("//table//tbody/tr/td[2]/text()").get()
        details[key] = value
        yield details

It was a bit hard to get the XPath selection correct. Now it's working.
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Working spider: finds the 'Source' table cell by its text and reads
    the value from its sibling cell."""
    name = 'pushpa'
    page_number = 1
    start_urls = [
        'https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important']

    def parse(self, response):
        # Anchor on the label cell, then step to the following sibling
        # cell for the value.
        label_xpath = "//td[contains(.,'Source')]/text()"
        value_xpath = "//td[contains(.,'Source')]/following-sibling::td/text()"
        label = response.xpath(label_xpath).get()
        sibling_value = response.xpath(value_xpath).get()
        yield {label: sibling_value}
Output:
{'Source': 'Sigmoid sinus and Inferior petrosal sinus'}

Related

Web scraper doesn't work correctly - field does not show any data

I tried to build a web scraper for Stack Overflow questions, but the 3rd column doesn't download the data — can you help me please?
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
class Question(Item):
    """Scrapy item; fields are prefixed a_/b_/c_ to force CSV column order."""
    a_id = Field()        # sequential row number assigned in the spider
    b_question = Field()  # question title text
    c_desc = Field()      # question excerpt/description (the failing column)
class StackOverflowSpider(Spider):
    """Question's spider: the excerpt (3rd column) comes back empty."""
    name = "MyFirstSpider"
    # NOTE(review): Scrapy setting names use underscores — this key should
    # be 'USER_AGENT', so the custom agent here is silently ignored.
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # Wrapping in Selector is redundant; response.xpath works directly.
        sel = Selector(response)
        # NOTE(review): '#' is not valid XPath — attribute tests need '@'
        # (e.g. div[@id="questions"]); as written this raises a ValueError.
        questions = sel.xpath('//div[#id="questions"]//div[#class="s-post-summary--content"]')
        i = 1
        for quest in questions:
            item = ItemLoader(Question(), quest)
            item.add_xpath('b_question', './/h3/a/text()')
            # NOTE(review): same '#class' -> '@class' problem here.
            item.add_xpath('c_desc', './/div[#class="s-post-summary--content-excerpt"]/text()')
            item.add_value('a_id', i)
            i = i+1
            yield item.load_item()
picture from csv file output
picture from website and the html code
Try it like this: I added some inline notes to explain the changes
from scrapy.spiders import Spider
class StackOverflowSpider(Spider):
    """Scrape question title and excerpt from the StackOverflow list page.

    Yields one dict per question summary with keys 'b_question', 'c_desc'
    and 'a_id' (the enumeration index).
    """
    name = "MyFirstSpider"
    # FIX: Scrapy setting names use underscores; 'USER-AGENT' is not a
    # recognised setting and would be silently ignored.
    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # Iterate each question summary; XPath attribute tests use '@',
        # not '#' (the '#' form raises a ValueError in Scrapy).
        for i, question in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            title = question.xpath('.//h3/a/text()').get()
            content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get()
            yield {
                # Guard against missing nodes so .strip() never hits None.
                "b_question": title.strip() if title else None,
                "c_desc": content.strip() if content else None,
                "a_id": i
            }

Create Xpath using scrapy

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess
class TestSpider(scrapy.Spider):
    """Question's spider: follows lawyer detail links and tries (and fails)
    to read the 'Status' value from each detail page."""
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        # NOTE(review): "//#href" should be "/@href" and '#class' should be
        # '@class' — '#' is not valid XPath syntax.
        books = response.xpath("//td[#class='icon_link']//a//#href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}  # never used
        d1 = response.xpath("//*[#class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                # BUG (per the answer below): this absolute '//div[i+1]'
                # does not target the <div> paired with the matched <span>.
                d2 = response.xpath("//div["+str(i+1)+"]//text()").get()
                print(d2)
I can see the status value on the page, but my code gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not selecting your element more specific by its text and getting the text from its next sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    """Print the text of the <div> that follows the 'Status:' label.

    Walks the direct children of the detail list and, for the child whose
    <span> label contains 'Status:', prints the text of its <div> value.
    """
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        label = child.xpath(".//span/text()").get()
        # FIX: the posted snippet dropped the 'in' keyword (SyntaxError);
        # also guard against children that carry no <span> at all.
        if label and 'Status:' in label:
            status = child.xpath(".//div/text()").get()
            print(status)

Extracting a website table provides wrong output

I am trying to extract a table; it gives me output, but the output is wrong. This is the page link:
https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Question's spider: extracts the salary table but the output values
    are wrong / misaligned (see the answer below)."""
    name = 'pushpa'
    page_number = 1  # unused
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        # NOTE(review): '#class' should be '@class' — '#' is invalid XPath.
        rows = response.xpath("//table[#class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]  # drop blank header cells
        columns=response.xpath("//table[#class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for column in columns:
            players=column.xpath('td//text()').getall()
            # BUG: joining collapses the whole row into ONE string, so
            # zip(keys, players) pairs each header with a single CHARACTER —
            # this is why the output is wrong.
            players = ''.join(players).strip()
            details = dict(zip(keys, players))
            yield details
Try this:
from scrapy.http import Request
import scrapy
class PushpaSpider(scrapy.Spider):
    """Scrape the hoopshype player-salary table into one dict per row.

    Header cells become the dict keys; each body row contributes the player
    name plus the numeric 'data-value' attributes of its cells.
    """
    name = 'pushpa'
    page_number = 1  # kept for interface compatibility; not used here
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        # The page has a single table, so no class filter is needed.
        rows = response.xpath("//table/thead/tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        # FIX: the original tested 'if I' (capital I) — a NameError at
        # runtime; the intent is to drop empty header strings.
        keys = [i for i in keys if i]
        columns = response.xpath("//table/tbody/tr")
        for column in columns:
            name = column.xpath('td[@class="name"]/a/text()').get()
            # Guard: .strip() on a missing name cell would raise on None.
            player_name = [name.strip() if name else '']
            detail = column.xpath('td/@data-value').getall()
            details = dict(zip(keys, player_name + detail))
            yield details
The data-value attribute stores the numerical values too, so we can use it as well, because I had problems extracting the text() the way you did.
Finally, I think you don't need to specify the table class name (table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']) because the page has just one table.

Scrapy script getting wrong data

I wrote a scrapy script which gets some correct data, but after a while it starts getting wrong data: the price column gets the wrong price, and the img_url column is half correct and half incorrect. As I'm new to scrapy, please guide me on how I can handle this issue. Thanks!
here is my code:
# -*- coding: utf-8 -*-
import scrapy
class CurtainSpider(scrapy.Spider):
    """Scrape title, price and image URL for redbubble shower curtains.

    Yields one dict per product tile, echoing the request's User-Agent so
    the output shows which agent fetched each page.
    """
    name = 'curtain'
    # FIX: allowed_domains takes bare domain names, not URLs — Scrapy warns
    # on (and may not match) entries containing a scheme or path.
    allowed_domains = ['www.redbubble.com']
    #start_urls = ['https://www.redbubble.com/shop/shower-curtains/']

    def start_requests(self):
        # Send a browser User-Agent explicitly with the seed request.
        yield scrapy.Request(url='https://www.redbubble.com/shop/shower-curtains/', callback=self.parse, headers={
            'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
        })

    def parse(self, response):
        # FIX: XPath attribute tests use '@class', not '#class' — the '#'
        # form raises a ValueError, so nothing was ever selected.
        products = response.xpath("//div[@class='styles__grid--197Ps']/a")
        for product in products:
            title = product.xpath(".//div[@class='styles__box--206r9 styles__paddingRight-0--fzRHs']/div[@class='styles__textContainer--1xehi styles__disableLineHeight--3n9Fg styles__nowrap--2Vk3A']/span/text()").get()
            price = product.xpath(".//span[@class='styles__text--NLf2i styles__body--3bpp7 styles__block--3OueG']/strong/span/text()").get()
            img_url = product.xpath(".//div[@class='styles__imageDiv--1zOnW']/div[@class='styles__box--206r9']/div[@class='styles__box--206r9 styles__ratioOuter--AlSwh styles__cover--zzlOp styles__square--3vP1e styles__rounded--1lyoH']/div[@class='styles__box--206r9 styles__ratioInner--KvIFM']/img/@src").get()
            yield {
                'Title' : title,
                'Price' : price,
                'Img_Url' : img_url,
                'User-Agent' : response.request.headers['User-Agent']
            }

How to send cookie with scrapy CrawlSpider requests?

I am trying to create this Reddit scraper using Python's Scrapy framework.
I have used the CrawlSpider to crawl through Reddit and its subreddits. But, when I come across pages that have adult content, the site asks for a cookie over18=1.
So, I have been trying to send a cookie with every request that the spider makes, but, its not working out.
Here, is my spider code. As you can see I tried to add a cookie with every spider request using the start_requests() method.
Could anyone here tell me how to do this? Or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest
class MySpider(CrawlSpider):
    """Question's CrawlSpider: tries to send the over18=1 cookie with
    every request, but only the seed requests get it."""
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    rules = (
        Rule(LinkExtractor(
            allow=['/r/nsfw/\?count=\d*&after=\w*']),
            callback='parse_item',
            follow=True),
    )

    def start_requests(self):
        # NOTE(review): cookies set here apply only to these seed requests;
        # the requests generated by the rules do not inherit them — this is
        # the core of the question.
        for i,url in enumerate(self.start_urls):
            print(url)
            yield Request(url,cookies={'over18':'1'},callback=self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = RedditItem()
            # NOTE(review): '#href' should be '@href' in XPath.
            item['url'] = title.xpath('#href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
Okay. Try doing something like this.
def start_requests(self):
    """Seed each start URL with the over18 cookie and a browser User-Agent."""
    browser_ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    for seed_index, seed_url in enumerate(self.start_urls):
        yield Request(seed_url, cookies={'over18': '1'}, callback=self.parse_item, headers=browser_ua)
It's the User-Agent which blocks you.
Edit:
Don't know what's wrong with CrawlSpider but Spider could work anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy
class MySpider(scrapy.Spider):
    """Plain Spider that crawls /r/nsfw, attaching the over18 cookie and a
    browser User-Agent to every request, and follows the 'next' link itself
    (replacing CrawlSpider's rules)."""
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    def request(self, url, callback):
        """
        wrapper for scrapy.request
        """
        request = scrapy.Request(url=url, callback=callback)
        request.cookies['over18'] = 1
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
        return request

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield self.request(url, self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = {}
            # FIX: XPath attribute access is '@href', not '#href'.
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
        # FIX: '@rel' / '@href' — the posted '#' form is invalid XPath.
        url = response.xpath('//a[@rel="nofollow next"]/@href').extract_first()
        if url:
            yield self.request(url, self.parse_item)
        # you may consider scrapy.pipelines.images.ImagesPipeline :D
# you may consider scrapy.pipelines.images.ImagesPipeline :D
The Scrapy Docs
1.Using a dict:
request_with_cookies = Request(url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'})
2.Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
cookies=[{'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency'}])
You can also send it via header.
scrapy.Request(url=url, callback=callback, headers={'Cookie':my_cookie})
You could use the process_request parameter in the rule, something like:
# Attach the cookie via the rule's process_request hook so EVERY request
# generated by the CrawlSpider rule carries it (not just the seeds).
rules = (
    Rule(LinkExtractor(
        allow=['/r/nsfw/\?count=\d*&after=\w*']),
        callback='parse_item',
        process_request='ammend_req_header',
        follow=True),
)  # FIX: the tuple opened here was never closed in the posted snippet (SyntaxError)

def ammend_req_header(self, request):
    """process_request hook: add the over18 cookie to each rule-built request."""
    request.cookies['over18'] = 1
    return request
I found solution for CrawlSpider:
def start_requests(self):
    """Emit the single seed request with the session cookie attached.

    Routing the response to ``self._parse`` (presumably CrawlSpider's
    internal entry point — verify against the Scrapy version in use) keeps
    the rule-based link extraction working.
    """
    seed = self.start_urls[0]
    yield Request(url=seed, callback=self._parse, cookies={'beget': 'begetok'})

Categories