Extracting a table from a website gives the wrong output - Python

I am trying to extract a table, but the output it gives me is wrong. This is the page link:
https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for column in columns:
            players = column.xpath('td//text()').getall()
            players = ''.join(players).strip()
            details = dict(zip(keys, players))
            yield details

Try this:
from scrapy.http import Request
import scrapy

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table/thead/tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table/tbody/tr")
        for column in columns:
            player_name = [column.xpath('td[@class="name"]/a/text()').get().strip()]
            detail = column.xpath('td/@data-value').getall()
            details = dict(zip(keys, player_name + detail))
            yield details
The data-value attribute stores the numerical values as well, so we can use it instead; I had the same problems extracting the text() as you did.
Finally, I don't think you need to specify the table class name (table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']), because the page has just one table.
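For what it's worth, the core bug in the original parse() is that ''.join(players) turns the whole row into a single string, and zip() over a string pairs each key with one character. A minimal sketch of the difference (the player and salary values are made up for illustration):
keys = ['Player', '2021/22']

# joined into a single string, as in the question's code:
cells = 'Stephen Curry$45,780,966'
dict(zip(keys, cells))      # {'Player': 'S', '2021/22': 't'} - one character per key

# kept as a list of whole cell values, as in the fixed code:
cells = ['Stephen Curry', '$45,780,966']
dict(zip(keys, cells))      # {'Player': 'Stephen Curry', '2021/22': '$45,780,966'}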

Related

How to separate data per column when writing data to Excel from web scraping results

I know how to separate the data when it looks like:
x, y, z
But I can't figure out how to do it when the data format is like:
Doe, John, BookName, Year, abstract with commas, links
(picture of the CSV file output in Excel after the scrape)
(picture of what I wanted it to look like)
This is my code:
from unittest import result
import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''))
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
This is my suggestion. I will need to know an offset to be able to test it.
A CSV separated by semi-colons will be far easier to separate in Excel.
import requests
from bs4 import BeautifulSoup
import csv
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # pass the headers dict that was defined above but never used
    result = requests.get(url + str(page), headers=headers, verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''), delimiter=';')
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
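To make the delimiter difference concrete, here is a toy row (all values invented) written both ways: with a comma delimiter the writer has to quote every field that contains a comma, while with a semicolon the embedded commas need no quoting at all, and many Excel locales split on ';' by default.
import csv
import sys

row = ['Doe, John', 'BookName', '2020', 'abstract, with, commas', 'https://example.com/item']

# Comma-delimited: fields containing commas get wrapped in quotes.
csv.writer(sys.stdout).writerow(row)
# "Doe, John",BookName,2020,"abstract, with, commas",https://example.com/item

# Semicolon-delimited: the embedded commas are just ordinary characters.
csv.writer(sys.stdout, delimiter=';').writerow(row)
# Doe, John;BookName;2020;abstract, with, commas;https://example.com/item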

Web scraper doesn't work correctly - field does not show any data

I tried to build a web scraper for Stack Overflow questions, but the third column doesn't capture any data. Can you help me, please?
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.loader import ItemLoader

class Question(Item):
    a_id = Field()
    b_question = Field()
    c_desc = Field()

class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        sel = Selector(response)
        questions = sel.xpath('//div[@id="questions"]//div[@class="s-post-summary--content"]')
        i = 1
        for quest in questions:
            item = ItemLoader(Question(), quest)
            item.add_xpath('b_question', './/h3/a/text()')
            item.add_xpath('c_desc', './/div[@class="s-post-summary--content-excerpt"]/text()')
            item.add_value('a_id', i)
            i = i + 1
            yield item.load_item()
(picture of the CSV file output)
(picture of the website and the HTML code)
Try it like this. I added some inline notes to explain the changes:
from scrapy.spiders import Spider

class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # iterate through each question as an xpath object.
        for i, question in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            # use the get method to grab the text
            title = question.xpath('.//h3/a/text()').get()
            content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get()
            # yielding a regular dictionary is, in your case, the same thing
            yield {
                "b_question": title.strip(),
                "c_desc": content.strip(),
                "a_id": i
            }
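One caveat: .get() returns None when a selector matches nothing, and None.strip() then raises AttributeError. parsel's SelectorList.get() accepts a default, so inside the same loop a slightly more defensive version of the two extractions would be:
# get(default='') avoids AttributeError on .strip() when nothing matches
title = question.xpath('.//h3/a/text()').get(default='')
content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get(default='')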

Create XPath using Scrapy

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I want to get the status value, but it gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text, and get the text from its next sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
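Note that the concat() step above is written in XPath 2.0 style (which xpather accepts), while Scrapy's lxml backend only speaks XPath 1.0, so inside parse_book you would read the two attributes separately. A sketch, with field names of my own choosing:
def parse_book(self, response):
    status = response.xpath(
        "//span[text()[contains(.,'Status')]]/following-sibling::div/text()").get()
    # the page splits the address into data-ea (user) and data-eb (domain)
    email_div = response.xpath("//span[text()[contains(.,'Email')]]/following-sibling::div")
    ea = email_div.xpath("./@data-ea").get()
    eb = email_div.xpath("./@data-eb").get()
    yield {
        'status': status.strip() if status else None,
        'email': '{}@{}'.format(ea, eb) if ea and eb else None,
    }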
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in child.xpath(".//span/text()").get(default=''):
            d2 = child.xpath(".//div/text()").get()
            print(d2)

Trying to scrape a table provides empty output

I am trying to scrape the table, but it gives me empty output. This is the page link: https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/
from scrapy.http import Request
import scrapy

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        details = {}
        key = response.xpath("//table//tbody/tr/td[1]/text()").get()
        value = response.xpath("//table//tbody/tr/td[2]/text()").get()
        details[key] = value
        yield details
It was a bit hard to get the xpath selection right. Now it's working:
from scrapy.http import Request
import scrapy

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = [
        'https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important']

    def parse(self, response):
        details = {}
        key = response.xpath("//td[contains(.,'Source')]/text()").get()
        value = response.xpath("//td[contains(.,'Source')]/following-sibling::td/text()").get()
        details[key] = value
        yield details
Output:
{'Source': 'Sigmoid sinus and Inferior petrosal sinus'}
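If you want every label/value pair in the table rather than just the 'Source' row, the same idea generalizes to a loop over the rows, treating the first cell as the key and the second as the value. A sketch along these lines (untested against the live page):
def parse(self, response):
    details = {}
    for row in response.xpath("//table//tr"):
        # first cell is the label, second cell is the value
        key = row.xpath("./td[1]/text()").get()
        value = row.xpath("./td[2]/text()").get()
        if key and value:
            details[key.strip()] = value.strip()
    yield details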

I am trying to index a request.css inside my web scraper in Python, but it does not work

I am using Scrapy to web-scrape in Python, and I am writing here because I want to figure out how to loop through all the values in a smart way.
I am trying the logic below, but it does not seem to work the way I want it to:
products = category.css("div.offer-category__body > div.offer-category__item")
for i in lengthproduct:
    img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
Do you have any good tips on how I should loop through the elements and pick up all the values I want?
I am trying to webscrape this website:
https://www.ica.se/butiker/nara/bjurholm/ica-nara-westmans-livs-231/butikserbjudanden/
But if you want to try the full code, you will need to save this URL: https://www.ica.se/butiker/ in a text file named "URLs To be Scrapped.txt".
The full code is attached below:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_splash import SplashRequest
import csv

with open("URLs To be Scrapped.txt") as f:
    URLs = f.readlines()

class Playbook(scrapy.Spider):
    name = "PostcodesSpider"
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'DataFinal.csv',
    }
    script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        splash:go(args.url)
        splash:wait(2)
        splash:set_viewport_full()
        return splash:html()
    end
    """

    def start_requests(self):
        for url in URLs:
            yield SplashRequest(url=url.strip(),
                                callback=self.parse, dont_filter=True,
                                headers={
                                    'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                  "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                },
                                endpoint="execute", args={"lua_source": self.script, }
                                )

    def parse(self, response):
        stores = response.css("div.store-card-content > div:nth-of-type(2) > a:last-of-type::attr(href)").extract()
        for store in stores:
            yield scrapy.Request(url="https://www.ica.se/" + store.strip(),
                                 callback=self.parse2, dont_filter=True,
                                 headers={
                                     'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                   "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                 }, )

    def parse2(self, response):
        storeName = response.css("h1.store-heading::text").extract_first()
        categories = response.css("section.offer-category")
        for category in categories:
            categoryName = category.css("header.offer-category__header::text").extract_first()
            products = category.css("div.offer-category__body > div.offer-category__item")
            print("test")
            lengthproduct = len(products)
            print(lengthproduct)
            for i in lengthproduct:
                img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
                yield {
                    "Store": storeName.strip(),
                    "Category": categoryName.strip(),
                    "Image": img,
                }

process = CrawlerProcess()
process.crawl(Playbook)
process.start()
If I understand your code correctly, you're trying to loop over an integer with the line for i in lengthproduct:, which will not work, since for loops require iterables. To write a loop that yields values from 0 to lengthproduct, you can use the range function.
However, in this case I think you just want to iterate over every found product.
You can do that as follows:
for product in products:
    img = product.css("div.offer-type__image > img::attr(data-original)").get()
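And if you do still need a numeric index alongside each product (say, for logging), enumerate() gives you both without breaking the iteration:
for i, product in enumerate(products):
    # product is a selector, so .css() chains off it directly
    img = product.css("div.offer-type__image > img::attr(data-original)").get()
    print(i, img)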
