Scrapy script getting wrong data - python

I wrote a Scrapy script that gets some correct data, but after a while it starts getting wrong data: the price column ends up with wrong prices, and the img_url column is half correct URLs and half incorrect ones. I'm new to Scrapy, so please guide me on how to handle this issue. Thanks!
Here is my code:
# -*- coding: utf-8 -*-
import scrapy


class CurtainSpider(scrapy.Spider):
    name = 'curtain'
    # allowed_domains takes bare domains, not full URLs
    allowed_domains = ['www.redbubble.com']
    # start_urls = ['https://www.redbubble.com/shop/shower-curtains/']

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.redbubble.com/shop/shower-curtains/',
            callback=self.parse,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
            })

    def parse(self, response):
        products = response.xpath("//div[@class='styles__grid--197Ps']/a")
        for product in products:
            title = product.xpath(".//div[@class='styles__box--206r9 styles__paddingRight-0--fzRHs']/div[@class='styles__textContainer--1xehi styles__disableLineHeight--3n9Fg styles__nowrap--2Vk3A']/span/text()").get()
            price = product.xpath(".//span[@class='styles__text--NLf2i styles__body--3bpp7 styles__block--3OueG']/strong/span/text()").get()
            img_url = product.xpath(".//div[@class='styles__imageDiv--1zOnW']/div[@class='styles__box--206r9']/div[@class='styles__box--206r9 styles__ratioOuter--AlSwh styles__cover--zzlOp styles__square--3vP1e styles__rounded--1lyoH']/div[@class='styles__box--206r9 styles__ratioInner--KvIFM']/img/@src").get()
            yield {
                'Title': title,
                'Price': price,
                'Img_Url': img_url,
                'User-Agent': response.request.headers['User-Agent']
            }
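One likely cause: the hashed class names in those XPaths (styles__grid--197Ps and friends) are build-generated, so they can vary between page variants and deployments, and product grids like this one usually lazy-load images, leaving a placeholder in src while the real URL sits in data-src. Below is a minimal, hedged sketch of a more defensive parse; the shortened selectors and the data-src fallback are assumptions about the page, not verified against it:

import scrapy


class CurtainSpider(scrapy.Spider):
    name = 'curtain'
    start_urls = ['https://www.redbubble.com/shop/shower-curtains/']

    def parse(self, response):
        # Match on a stable class-name prefix instead of the full hashed name.
        for product in response.xpath("//div[contains(@class, 'styles__grid')]/a"):
            title = product.xpath(".//span/text()").get()
            price = product.xpath(".//strong/span/text()").get()
            # Lazy-loaded grids often keep the real image URL in data-src;
            # fall back to src only when data-src is absent (assumption).
            img_url = (product.xpath(".//img/@data-src").get()
                       or product.xpath(".//img/@src").get())
            yield {'Title': title, 'Price': price, 'Img_Url': img_url}

If the fields still drift out of sync, compare a "wrong" row in the saved output against the raw HTML Scrapy actually received (for example via scrapy shell) rather than what the browser shows; the served markup is often different.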

Related

Web scraper doesn't work correctly - field does not show any data

I tried to write a web scraper for Stack Overflow questions, but the 3rd column doesn't download the data. Can you help me, please?
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.loader import ItemLoader


class Question(Item):
    a_id = Field()
    b_question = Field()
    c_desc = Field()


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        sel = Selector(response)
        questions = sel.xpath('//div[@id="questions"]//div[@class="s-post-summary--content"]')
        i = 1
        for quest in questions:
            item = ItemLoader(Question(), quest)
            item.add_xpath('b_question', './/h3/a/text()')
            item.add_xpath('c_desc', './/div[@class="s-post-summary--content-excerpt"]/text()')
            item.add_value('a_id', i)
            i = i + 1
            yield item.load_item()
(screenshot: the CSV file output)
(screenshot: the website and the corresponding HTML code)
Try it like this; I added some inline notes to explain the changes:
from scrapy.spiders import Spider


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # iterate through each question as an xpath object.
        for i, question in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            # use the get method to grab the text
            title = question.xpath('.//h3/a/text()').get()
            content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get()
            # yielding a regular dictionary is, in your case, the same thing
            yield {
                "b_question": title.strip(),
                "c_desc": content.strip(),
                "a_id": i
            }
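If you would rather keep the Item/ItemLoader approach from the question, the usual way to get clean scalar values out of a loader is with input and output processors. A minimal sketch, using the standard MapCompose and TakeFirst processors from the itemloaders package that ships with Scrapy (field names taken from the question):

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.item import Field, Item
from scrapy.loader import ItemLoader
from scrapy.spiders import Spider


class Question(Item):
    a_id = Field()
    b_question = Field()
    c_desc = Field()


class QuestionLoader(ItemLoader):
    default_item_class = Question
    default_output_processor = TakeFirst()  # unwrap single-element lists
    b_question_in = MapCompose(str.strip)   # strip whitespace on the way in
    c_desc_in = MapCompose(str.strip)


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        for i, quest in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            loader = QuestionLoader(selector=quest)
            loader.add_xpath('b_question', './/h3/a/text()')
            loader.add_xpath('c_desc', './/div[@class="s-post-summary--content-excerpt"]/text()')
            loader.add_value('a_id', i)
            yield loader.load_item()

Without an output processor such as TakeFirst, each loaded field comes out as a one-element list, which can make the exported CSV columns look odd.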

Trying to scrape a table but getting empty output

I am trying to scrape the table, but it gives me empty output. This is the page link: https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        details = {}
        key = response.xpath("//table//tbody/tr/td[1]/text()").get()
        value = response.xpath("//table//tbody/tr/td[2]/text()").get()
        details[key] = value
        yield details
It was a bit hard to get the XPath selection right. Now it's working:
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = [
        'https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important']

    def parse(self, response):
        details = {}
        key = response.xpath("//td[contains(.,'Source')]/text()").get()
        value = response.xpath("//td[contains(.,'Source')]/following-sibling::td/text()").get()
        details[key] = value
        yield details
Output:
{'Source': 'Sigmoid sinus and Inferior petrosal sinus'}
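The original selectors probably failed because //table//tbody matches nothing in the served HTML: browsers insert <tbody> when rendering, but it is frequently absent from the raw source that Scrapy downloads. If you want the whole table rather than just the Source row, the same following-sibling pattern generalizes. A hedged sketch, assuming every row is a simple two-cell key/value pair (not re-verified for that page):

import scrapy


class PushpaTableSpider(scrapy.Spider):
    name = 'pushpa_table'
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']

    def parse(self, response):
        details = {}
        # Pair every first-column cell with the cell right after it.
        for cell in response.xpath("//table//tr/td[1]"):
            key = cell.xpath("normalize-space(.)").get()
            value = cell.xpath("normalize-space(./following-sibling::td[1])").get()
            if key:
                details[key] = value
        yield details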

Extracting a table from a website gives the wrong output

I am trying to extract a table. I do get output, but it is wrong. This is the page link: https://hoopshype.com/salaries/players/
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//thead//tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']//tbody//tr")
        for column in columns:
            players = column.xpath('td//text()').getall()
            players = ''.join(players).strip()
            details = dict(zip(keys, players))
            yield details
Try this:
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://hoopshype.com/salaries/players/']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        rows = response.xpath("//table/thead/tr")
        keys = rows.xpath(".//td/text()").getall()
        keys = [i.strip() for i in keys]
        keys = [i for i in keys if i]
        columns = response.xpath("//table/tbody/tr")
        for column in columns:
            player_name = [column.xpath('td[@class="name"]/a/text()').get().strip()]
            detail = column.xpath('td/@data-value').getall()
            details = dict(zip(keys, player_name + detail))
            yield details
The data-value attribute stores the numerical values too, so we can use it as well; I had the same problems extracting the text() that you did.
Finally, I don't think you need to specify the table class name (table[@class='hh-salaries-ranking-table hh-salaries-table-sortable responsive']), because the page has just one table.
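If you then want the salary columns as numbers rather than strings, the data-value attributes lend themselves to a small cleanup step. A hedged sketch building on the answer's selectors (that data-value holds plain digit strings is an assumption taken from the answer, not re-verified):

import scrapy


class SalarySpider(scrapy.Spider):
    name = 'salaries'
    start_urls = ['https://hoopshype.com/salaries/players/']

    def parse(self, response):
        keys = [k.strip() for k in response.xpath("//table/thead/tr//td/text()").getall()]
        keys = [k for k in keys if k]
        for row in response.xpath("//table/tbody/tr"):
            # Guard against rows without a name cell instead of calling .strip() on None.
            name = (row.xpath('td[@class="name"]/a/text()').get() or '').strip()
            raw = row.xpath('td/@data-value').getall()
            # Cast clean digit strings to int; leave anything else untouched.
            values = [int(v) if v.isdigit() else v for v in raw]
            yield dict(zip(keys, [name] + values))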

I am trying to index a response.css selection inside my web scraper in Python, but it does not work

I am using Scrapy for web scraping in Python, and I am writing here because I want to figure out how to for-loop through all the values in a smart way.
I am trying the logic below, but it does not seem to work the way I want it to...
products = category.css("div.offer-category__body > div.offer-category__item")
for i in lengthproduct:
    img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
Do you have any good tips on how I should loop through the elements and pick up all the values I want?
I am trying to scrape this website: https://www.ica.se/butiker/nara/bjurholm/ica-nara-westmans-livs-231/butikserbjudanden/
But if you want to try the full code, you will need to put this URL: https://www.ica.se/butiker/ into a text file named "URLs To be Scrapped.txt".
The full code is attached below:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_splash import SplashRequest
import csv

with open("URLs To be Scrapped.txt") as f:
    URLs = f.readlines()


class Playbook(scrapy.Spider):
    name = "PostcodesSpider"
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'DataFinal.csv',
    }
    script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        splash:go(args.url)
        splash:wait(2)
        splash:set_viewport_full()
        return splash:html()
    end
    """

    def start_requests(self):
        for url in URLs:
            yield SplashRequest(url=url.strip(),
                                callback=self.parse, dont_filter=True,
                                headers={
                                    'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                  "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                },
                                endpoint="execute", args={"lua_source": self.script, }
                                )

    def parse(self, response):
        stores = response.css("div.store-card-content > div:nth-of-type(2) > a:last-of-type::attr(href)").extract()
        for store in stores:
            yield scrapy.Request(url="https://www.ica.se/" + store.strip(),
                                 callback=self.parse2, dont_filter=True,
                                 headers={
                                     'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                   "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                 }, )

    def parse2(self, response):
        storeName = response.css("h1.store-heading::text").extract_first()
        categories = response.css("section.offer-category")
        for category in categories:
            categoryName = category.css("header.offer-category__header::text").extract_first()
            products = category.css("div.offer-category__body > div.offer-category__item")
            print("test")
            lengthproduct = len(products)
            print(lengthproduct)
            for i in lengthproduct:
                img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
                yield {
                    "Store": storeName.strip(),
                    "Category": categoryName.strip(),
                    "Image": img,
                }


process = CrawlerProcess()
process.crawl(Playbook)
process.start()
If I understand your code correctly, you're trying to loop over an integer with the line for i in lengthproduct:, which will not work, since for loops require iterables. To write a for loop that returns values from 0 to lengthproduct, you can use the range function.
However, in this case I think you just want to iterate over every product that was found.
You can do that as follows:
for product in products:
    img = product.css("div.offer-type__image > img::attr(data-original)").get()
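If you also need a running index alongside each product (for example, to number the offers within a category), enumerate gives you both at once. A short sketch; the ProductNumber field is hypothetical, just to show where the index would go:

for i, product in enumerate(products, start=1):
    img = product.css("div.offer-type__image > img::attr(data-original)").get()
    yield {
        "ProductNumber": i,  # hypothetical field illustrating the index
        "Image": img,
    }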

Retrieving Lyrics from Musixmatch

import requests
import json
import urllib
import lyricsgenius
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.dbsparta


def get_artist_id(artistName):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/artist.search?page_size=100&format=json&apikey=123&q_artist=" + artistName, headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['artist_list'][0]['artist']['artist_id']
    # print(response.json()['message']['body']['artist_list'][0]['artist']['artist_id'])


def get_album_ids(artist_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    album_response = requests.get("https://api.musixmatch.com/ws/1.1/artist.albums.get?page_size=100&format=json&apikey=123&artist_id=" + str(artist_id), headers=headers)
    album_response.encoding = 'UTF-8'
    # counter = 0
    # album_list = album_response.json()['message']['body']['album_list']
    return album_response.json()['message']['body']['album_list']
    # print(album_response.json()['message']['body']['album_list'])
    # for album in album_list:
    #     # counter += 1
    #     print(album['album']['album_id'])


def get_album_tracks_ids(album_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/album.tracks.get?page_size=100&format=json&apikey=123&album_id=" + str(album_id), headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['track_list']


# def get_track_id(artist_id):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
#     response = requests.get("https://api.musixmatch.com/ws/1.1/track.search?page_size=100format=json&apikey=123&f_artist_id=" + str(artist_id), headers=headers)
#     response.encoding = 'UTF-8'
#     for tracks in response.json()['message']['body']['track_list']:
#         print(tracks['track']['track_name'])


def get_track_lyrics(track_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id), headers=headers)
    response.encoding = 'UTF-8'
    # return response['message']['body']['lyrics']['lyrics_body']
    return response.json()['message']['body']['lyrics']['lyrics_body']


def main():
    stars_list = list(db.new_top200.find({}, {'_id': 0}))
    for stars in stars_list:
        print(stars['name'])
        album_ids = get_album_ids(get_artist_id(stars['name']))
        # if album_ids is not None:
        for album_id in album_ids:
            # if album_id is not None and get_album_tracks_ids(album_id['album']['album_id']) is not [] and get_album_tracks_ids(album_id['album']['album_id']) is not None:
            track_ids = get_album_tracks_ids(album_id['album']['album_id'])
            for track in track_ids:
                # if track is not [] and track['track']['track_id'] is not [] and track is not None:
                # if get_track_lyrics(track['track']['track_id']) is not [] and get_track_lyrics(track['track']['track_id']) is not None:
                lyric = get_track_lyrics(track['track']['track_id'])
                db.new_top200.update_one({'name': stars['name']}, {'$push': {'lyrics': lyric}})


# get_track_id(get_artist_id('Kanye West'))
# get_album_ids(get_artist_id("Kanye West"))
# get_album_tracks(15565713)

if __name__ == "__main__":
    # for album in get_album_ids(get_artist_id("Kanye West")):
    #     get_album_tracks_ids(album['album']['album_id'])
    # get_track_lyrics(96610952)
    # get_album_tracks_ids(15565713)
    # get_album_ids(get_artist_id('Drake'))
    main()
I'm trying to get ALL of the lyrics of an artist and store them in a database. For example, if the artist is "Drake", I want all of the lyrics stored under the 'lyrics' key in my database.
However, I get a bunch of unpredictable errors every time I run the same code. For example, it will insert 400 lyrics without any problem, and then suddenly I'll get an error saying 'list indices must be integers or slices, not str'. This error is quite confusing to me, because I assumed all of the JSON data would be in the same format, yet the error only appears after 400 song lyrics have been processed without a problem.
I can run the same code and, at about 200 song lyrics in, I'll get a JSON decode error; then, when I run it AGAIN, I'll get the first error again after processing a different number of song lyrics.
Can someone explain the random nature of these errors?
Thank you!
You are making assumptions about the data types that will be returned from the JSON. In your case, I suspect that one of the JSON elements is a list, not an object.
Your issue can be reproduced with this simple example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

print(my_dict['message']['body']['lyrics']['lyrics_body'])
gives:
TypeError: list indices must be integers or slices, not str
How do you fix it? You'll need to check each element matches what you expect; for example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}


def checker(item, field):
    if isinstance(item, dict):
        return item.get(field)
    else:
        raise ValueError(f"'{item}' in field '{field}' is not a valid dict")


message = checker(my_dict, 'message')
body = checker(message, 'body')
lyrics = checker(body, 'lyrics')
print(checker(lyrics, 'lyrics'))
gives:
ValueError: '['Always look on the bright side of life']' in field 'lyrics' is not a valid dict
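Applied to the question's code, the same idea means validating each level of the Musixmatch response before indexing into it, and also guarding against non-JSON responses (the JSON decode error you saw at around 200 songs is typical of an HTML error page or a rate-limit response coming back instead of JSON). A hedged sketch of a defensive get_track_lyrics; the response shape is taken from the question's code, and apikey=123 is the question's placeholder:

import requests


def get_track_lyrics(track_id):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(
        "https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id),
        headers=headers)
    try:
        data = response.json()
    except ValueError:
        return None  # body was not JSON, e.g. an HTML error page
    # Walk down one level at a time, bailing out on any unexpected shape.
    for field in ('message', 'body', 'lyrics'):
        if not isinstance(data, dict):
            return None  # a list or empty body here would raise your TypeError
        data = data.get(field)
    return data.get('lyrics_body') if isinstance(data, dict) else None

In main(), you would then skip None results before pushing to MongoDB, so one malformed response no longer aborts a run hundreds of songs in.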
