import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a/@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I can find the status label, but it gives me empty output for the value. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text and get the text from its following sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
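Plugged into a Scrapy callback, that could look roughly like this (a minimal sketch, assuming the e-mail really is split across data-ea/data-eb attributes as on the linked page, and rewritten in XPath 1.0 because lxml does not support the /(concat(...)) step):

def parse_book(self, response):
    status = response.xpath(
        "//span[contains(text(), 'Status')]/following-sibling::div/text()"
    ).get(default='').strip()
    email_div = response.xpath("//span[contains(text(), 'Email')]/following-sibling::div")
    # the e-mail is obfuscated into two attributes; join them with '@'
    ea = email_div.xpath('./@data-ea').get()
    eb = email_div.xpath('./@data-eb').get()
    email = f'{ea}@{eb}' if ea and eb else ''
    yield {'status': status, 'email': email}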
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in (child.xpath(".//span/text()").get() or ''):
            d2 = child.xpath(".//div/text()").get()
            print(d2)
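If you want the value in your exported feed instead of only printed, a minimal variation of the same loop (same selectors, just filling and yielding the dict) could be:

def parse_book(self, response):
    wev = {}
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        label = child.xpath(".//span/text()").get() or ''
        if 'Status:' in label:
            wev['status'] = (child.xpath(".//div/text()").get() or '').strip()
    yield wev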
I tried to build a web scraper for Stack Overflow questions, but the third column doesn't download the data. Can you help me, please?
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.loader import ItemLoader


class Question(Item):
    a_id = Field()
    b_question = Field()
    c_desc = Field()


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        sel = Selector(response)
        questions = sel.xpath('//div[@id="questions"]//div[@class="s-post-summary--content"]')
        i = 1
        for quest in questions:
            item = ItemLoader(Question(), quest)
            item.add_xpath('b_question', './/h3/a/text()')
            item.add_xpath('c_desc', './/div[@class="s-post-summary--content-excerpt"]/text()')
            item.add_value('a_id', i)
            i = i + 1
            yield item.load_item()
[screenshot: CSV output file]
[screenshot: the website and its HTML code]
Try it like this; I added some inline notes to explain the changes:
from scrapy.spiders import Spider


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        # the Scrapy setting is spelled USER_AGENT (underscore, not dash)
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # iterate through each question as an xpath object.
        for i, question in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            # use the get method to grab the text
            title = question.xpath('.//h3/a/text()').get()
            content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get()
            # yielding a regular dictionary is the same thing in your case
            yield {
                "b_question": title.strip(),
                "c_desc": content.strip(),
                "a_id": i
            }
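Since you were writing the results to CSV, note that the yielded dictionaries work with Scrapy's feed exports as well; running the spider with the -o option writes each dict as a CSV row (file name here is just an example):

scrapy crawl MyFirstSpider -o questions.csv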
I am trying to scrape the table, but it gives me empty output. This is the page link: https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        details = {}
        key = response.xpath("//table//tbody/tr/td[1]/text()").get()
        value = response.xpath("//table//tbody/tr/td[2]/text()").get()
        details[key] = value
        yield details
It was a bit hard to get the XPath selection right. Now it's working:
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = [
        'https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important']

    def parse(self, response):
        details = {}
        key = response.xpath("//td[contains(.,'Source')]/text()").get()
        value = response.xpath("//td[contains(.,'Source')]/following-sibling::td/text()").get()
        details[key] = value
        yield details
Output:
{'Source': 'Sigmoid sinus and Inferior petrosal sinus'}
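If you want every row of that table rather than only the 'Source' line, a minimal sketch (assuming each row is a tr with two td cells, as on the linked page) would be:

def parse(self, response):
    details = {}
    for row in response.xpath("//table//tr"):
        key = row.xpath("./td[1]//text()").get(default='').strip()
        value = row.xpath("./td[2]//text()").get(default='').strip()
        if key:
            details[key] = value
    yield details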
I am trying to extract a table. It gives me output, but the output is wrong. This is the page link: https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for column in columns:
            players = column.xpath('td//text()').getall()
            players = ''.join(players).strip()
            details = dict(zip(keys, players))
            yield details
Try this:
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table/thead/tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table/tbody/tr")
        for column in columns:
            player_name = [column.xpath('td[@class="name"]/a/text()').get().strip()]
            detail = column.xpath('td/@data-value').getall()
            details = dict(zip(keys, player_name + detail))
            yield details
The data-value attribute stores the numerical values as well, so we can use it instead; I had the same problems extracting text() as you did.
Finally, I don't think you need to specify the table class name (table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']) because the page has just one table.
import scrapy
from scrapy.http import Request


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']

    def parse(self, response):
        for link in response.xpath("//div[@class='exbox-name']/a/@href"):
            yield response.follow(link.get(), callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//table[@class="expo-table general-color"]//tr')
        table = {}
        for row in rows:
            key = row.xpath('.//td[1]//text()').get(default='').strip()
            value = row.xpath('.//td[2]/text()').getall()
            value = ''.join(value).strip()
            table.update({key: value})
        yield table
I am trying to scrape the table, but it does not give me the information for Telefono, Fax, E-mail, Membro di, or Social. Check this output:
{'Indirizzo': 'Dr.-Auner-Str. 21a', 'Città ': 'Raaba / Graz', 'Nazionalità ': 'Austria', 'Sito web': '', 'Stand': 'Pad. 5 B22 C27', 'Telefono': '', 'Fax': '', 'E-mail': '', 'Social': ''}
The page link is http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh
The values for telephone, fax, etc. are inside an a tag, so you need to adjust your XPath selectors to account for those cases.
See the sample below:
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }

    def parse(self, response):
        for link in response.xpath("//div[@class='exbox-name']/a/@href"):
            yield response.follow(link.get(), callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//table[@class="expo-table general-color"]/tr')
        table = {}
        for row in rows:
            key = row.xpath('./td[1]//text()').get(default='').strip()
            value = row.xpath('./td[2]/text()').getall()
            value = ''.join(value).strip()
            if not value:
                value = row.xpath('./td[2]/a/text()').getall()
                value = ''.join(value).strip()
            table.update({key: value})
        yield table
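An alternative to the fallback branch is to take all descendant text of the second cell in one go, which covers both the plain-text rows and the rows whose value sits inside an a tag (a sketch of the replacement line, not tested against every exhibitor page):

value = ''.join(row.xpath('./td[2]//text()').getall()).strip()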
I am trying to create this Reddit scraper using Python's Scrapy framework.
I have used CrawlSpider to crawl through Reddit and its subreddits, but when I come across pages that have adult content, the site asks for the cookie over18=1.
So I have been trying to send a cookie with every request that the spider makes, but it's not working out.
Here is my spider code. As you can see, I tried to add a cookie to every spider request using the start_requests() method.
Could anyone here tell me how to do this? Or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest


class MySpider(CrawlSpider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']
    rules = (
        Rule(LinkExtractor(
            allow=['/r/nsfw/\?count=\d*&after=\w*']),
            callback='parse_item',
            follow=True),
    )

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            print(url)
            yield Request(url, cookies={'over18': '1'}, callback=self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = RedditItem()
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
Okay. Try doing something like this.
def start_requests(self):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    for i, url in enumerate(self.start_urls):
        yield Request(url, cookies={'over18': '1'}, callback=self.parse_item, headers=headers)
It's the User-Agent that gets you blocked.
Edit:
I don't know what's wrong with CrawlSpider, but a plain Spider works anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy


class MySpider(scrapy.Spider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    def request(self, url, callback):
        """
        wrapper for scrapy.Request
        """
        request = scrapy.Request(url=url, callback=callback)
        request.cookies['over18'] = 1
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
        return request

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield self.request(url, self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = {}
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
        url = response.xpath('//a[@rel="nofollow next"]/@href').extract_first()
        if url:
            yield self.request(url, self.parse_item)
        # you may consider scrapy.pipelines.images.ImagesPipeline :D
From the Scrapy docs:
1. Using a dict:
request_with_cookies = Request(url="http://www.example.com",
                               cookies={'currency': 'USD', 'country': 'UY'})
2. Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
                               cookies=[{'name': 'currency',
                                         'value': 'USD',
                                         'domain': 'example.com',
                                         'path': '/currency'}])
You can also send it via a header:
scrapy.Request(url=url, callback=callback, headers={'Cookie':my_cookie})
You could use the process_request parameter in the rule, something like:
rules = (
    Rule(LinkExtractor(
        allow=['/r/nsfw/\?count=\d*&after=\w*']),
        callback='parse_item',
        process_request='ammend_req_header',
        follow=True),
)

def ammend_req_header(self, request):
    request.cookies['over18'] = 1
    return request
I found a solution for CrawlSpider:
def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self._parse, cookies={'beget': 'begetok'})