import requests
import json
import urllib
import lyricsgenius
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.dbsparta
def get_artist_id(artistName):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/artist.search?page_size=100&format=json&apikey=123&q_artist=" + artistName, headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['artist_list'][0]['artist']['artist_id']
    # print(response.json()['message']['body']['artist_list'][0]['artist']['artist_id'])

def get_album_ids(artist_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    album_response = requests.get("https://api.musixmatch.com/ws/1.1/artist.albums.get?page_size=100&format=json&apikey=123&artist_id=" + str(artist_id), headers=headers)
    album_response.encoding = 'UTF-8'
    # counter = 0
    # album_list = album_response.json()['message']['body']['album_list']
    return album_response.json()['message']['body']['album_list']
    # print(album_response.json()['message']['body']['album_list'])
    # for album in album_list:
    #     # counter += 1
    #     print(album['album']['album_id'])

def get_album_tracks_ids(album_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/album.tracks.get?page_size=100&format=json&apikey=123&album_id=" + str(album_id), headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['track_list']

# def get_track_id(artist_id):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
#     response = requests.get("https://api.musixmatch.com/ws/1.1/track.search?page_size=100format=json&apikey=123&f_artist_id=" + str(artist_id), headers=headers)
#     response.encoding = 'UTF-8'
#     for tracks in response.json()['message']['body']['track_list']:
#         print(tracks['track']['track_name'])

def get_track_lyrics(track_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id), headers=headers)
    response.encoding = 'UTF-8'
    # return response['message']['body']['lyrics']['lyrics_body']
    return response.json()['message']['body']['lyrics']['lyrics_body']

def main():
    stars_list = list(db.new_top200.find({}, {'_id': 0}))
    for stars in stars_list:
        print(stars['name'])
        album_ids = get_album_ids(get_artist_id(stars['name']))
        # if album_ids is not None:
        for album_id in album_ids:
            # if album_id is not None and get_album_tracks_ids(album_id['album']['album_id']) is not [] and get_album_tracks_ids(album_id['album']['album_id']) is not None:
            track_ids = get_album_tracks_ids(album_id['album']['album_id'])
            for track in track_ids:
                # if track is not [] and track['track']['track_id'] is not [] and track is not None:
                # if get_track_lyrics(track['track']['track_id']) is not [] and get_track_lyrics(track['track']['track_id']) is not None:
                lyric = get_track_lyrics(track['track']['track_id'])
                db.new_top200.update_one({'name': stars['name']}, {'$push': {'lyrics': lyric}})
    # get_track_id(get_artist_id('Kanye West'))
    # get_album_ids(get_artist_id("Kanye West"))
    # get_album_tracks(15565713)

if __name__ == "__main__":
    # for album in get_album_ids(get_artist_id("Kanye West")):
    #     get_album_tracks_ids(album['album']['album_id'])
    # get_track_lyrics(96610952)
    # get_album_tracks_ids(15565713)
    # get_album_ids(get_artist_id('Drake'))
    main()
I'm trying to get ALL of the lyrics of an artist and store them in a database. For example, if the artist is "Drake" I want all of his lyrics stored under the 'lyrics' key in my database.
However, I get a bunch of unpredictable errors every time I run the same code. For example, it will insert 400 lyrics without any problem and then suddenly I'll get an error saying that 'list indices must be integers or slices, not str'. This error is quite confusing to me because I'm assuming that all of the JSON data are in the same format, yet the error appears out of nowhere after 400 song lyrics have been processed with no problem.
I can run the same code and, at about 200 song lyrics in, I'll get a JSON decode error; then, when I run it AGAIN, after processing a different number of song lyrics I'll get the error I described in the beginning.
Can someone explain the random nature of this error?
Thank you!
You are making assumptions about the data types that will be returned from the JSON. In your case, I suspect that one of the JSON elements is a list, not an object.
Your issue can be reproduced with this simple example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

print(my_dict['message']['body']['lyrics']['lyrics_body'])
gives:
TypeError: list indices must be integers or slices, not str
How do you fix it? You'll need to check that each element matches what you expect; for example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

def checker(item, field):
    if isinstance(item, dict):
        return item.get(field)
    else:
        raise ValueError(f"'{item}' in field '{field}' is not a valid dict")

message = checker(my_dict, 'message')
body = checker(message, 'body')
lyrics = checker(body, 'lyrics')
print(checker(lyrics, 'lyrics'))
gives:
ValueError: '['Always look on the bright side of life']' in field 'lyrics' is not a valid dict
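Applied to the Musixmatch calls in the question, the same idea might look like the sketch below. This is only an illustration, not a drop-in fix: the helper name safe_get_lyrics and the None fallbacks are my additions, and the idea that some tracks return an empty list instead of a lyrics object is an assumption consistent with the error you saw.
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # same idea as the headers dict used above

def safe_get_lyrics(track_id):
    # Hypothetical helper: return None instead of raising when the JSON
    # body is not the dict shape the happy path expects.
    response = requests.get(
        "https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id),
        headers=headers)
    try:
        body = response.json()['message']['body']
    except (ValueError, KeyError, TypeError):
        return None  # JSON decode error or unexpected top-level structure
    if not isinstance(body, dict) or not isinstance(body.get('lyrics'), dict):
        return None  # e.g. an empty list where a lyrics object was expected
    return body['lyrics'].get('lyrics_body')
main() could then skip tracks where safe_get_lyrics returns None instead of crashing partway through a run.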
Related
I'm trying to scrape Amazon listings, but I consistently get a redirect error from my scraper. I even used http.cookiejar.CookieJar and a urllib.request.HTTPCookieProcessor to avoid the redirect loop, but I still get the error.
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib
import time
import requests, random
from requests.exceptions import HTTPError
from socket import error as SocketError
from http.cookiejar import CookieJar

data = []

def getdata(url):
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    ]
    user_agent = random.choice(user_agents)
    header_ = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=header_)
    cj = CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    response = opener.open(req)
    amazon_html = response.read().decode('utf8', errors='ignore')
    a_soup = soup(amazon_html, 'html.parser')
    cat = k
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        try:
            asin = e.find('a')['href'].replace('dp%2F', '/dp/').split('/dp/')[1].replace('%2', '/ref').split('/ref')[0]
        except:
            asin = 'No ASIN Found'
        try:
            title = e.find('h2').text
        except:
            title = None
        data.append({
            'Category': cat,
            'ASIN': asin,
            'Title': title
        })
    return a_soup

def getnextpage(a_soup):
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        url = 'http://www.amazon.in' + str(page)
    except:
        url = None
    return url

keywords = ['headphone', 'mobile', 'router', 'smartwatch']
for k in keywords:
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Output
HTTPError: HTTP Error 308: Permanent Redirect
Any ideas how I can correct this?
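One hedged idea to experiment with (a sketch only, not verified against Amazon): requests follows 301/302/303/307/308 redirects by default, whereas urllib.request's redirect handler historically does not handle 308, so fetching the page with a requests.Session instead of a urllib opener may get past the Permanent Redirect. The helper name getdata_requests below is made up for illustration.
import random
import requests

def getdata_requests(url, user_agents):
    # Hypothetical variant of getdata(): let requests manage cookies and
    # redirects (including 308) instead of building a urllib opener.
    session = requests.Session()
    session.headers['User-Agent'] = random.choice(user_agents)
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return response.text  # pass this to BeautifulSoup as before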
The JSON template I'm working on looks like this:
My Code:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

with open("Live.csv", "w+", newline="", encoding="UTF-8") as f:
    titlemenu = "id" + "\n"
    f.write(titlemenu)
    url = f'https://etcetcetc.com'
    response = requests.get(url, headers=headers).json()
    events = response['events']
    for event in events:
        if 'true' in event['hasEventPlayerStatistics']:
            id = event['id']
            row = str(id) + "\n"
            f.write(row)
    f.close()
The error message is:
TypeError: argument of type 'bool' is not iterable
How should I write this if statement to be able to retrieve these values?
The issue is here.
if 'true' in event['hasEventPlayerStatistics']:
event['hasEventPlayerStatistics'] is already a Python bool object, so you only need to do
if event['hasEventPlayerStatistics']:
    pass  # do something
When you call .json() on the response object, it already performs the following translations.
JSON     Python
true     True
false    False
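As a quick illustration of that translation (standard library behaviour, independent of this particular API):
import json

event = json.loads('{"hasEventPlayerStatistics": true}')
print(event['hasEventPlayerStatistics'])        # True
print(type(event['hasEventPlayerStatistics']))  # <class 'bool'>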
Hi everybody, I'm trying to get the captcha image on a website so that I can scrape it. My problem is that the URL for the captcha image contains a parameter and I can't find where it comes from. So I tried using parser.xpath, but it doesn't work. This is my code:
import requests, io, re
from PIL import Image
from lxml import html

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
}

session = requests.Session()
login_url = 'https://www.sat.gob.pe/WebSiteV8/popupv2.aspx?t=6'
login_form_res = session.get(login_url, headers=headers)
myhtml = login_form_res.text
evalu = ''
for match in re.finditer(r'(mysession=)(.*?)(")', myhtml):
    evalu = myhtml[match.start():match.end()]
evalu = evalu.replace("mysession=", "")
evalu = evalu.replace('"', '')
print(evalu)

url_infractions = 'https://www.sat.gob.pe/VirtualSAT/modulos/RecordConductor.aspx?mysession=' + evalu
login_form_res = session.get(url_infractions, headers=headers)
myhtml = login_form_res.text
parser = html.fromstring(login_form_res.text)
idPic = parser.xpath('//img[@class="captcha_class"]/@src')
urlPic = "https://www.sat.gob.pe/VirtualSAT" + idPic[0].replace("..", "")
print(urlPic)

image_content = session.get(urlPic, headers=headers)
image_file = io.BytesIO(image_content)
image = Image.open(image_file).convert('RGB').content
image.show()
As a result, I get an exception that says TypeError: a bytes-like object is required, not 'Response'. I'm confused. I would really appreciate your help. Thanks in advance.
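As a hedged aside (a sketch only, using a made-up URL, not a verified fix for this site): session.get() returns a Response object, while io.BytesIO and Image.open need raw bytes, which are available on the response's .content attribute:
import io
import requests
from PIL import Image

# .content is the response body as bytes, which is what io.BytesIO expects
image_bytes = requests.get("https://example.com/captcha.png").content
image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
image.show()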
I am using requests to get a webpage, but it fails.
Here is my code:
import requests
from bs4 import BeautifulSoup
url = 'http://db.house.qq.com/index.php?mod=search&city=bj'
headers = {}
headers['authority'] = 'db.house.qq.com'
headers['method'] = 'GET'
headers['path'] = '/index.php?mod=search&city=bj'
headers['scheme'] = 'https'
headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
headers['accept-encoding'] = 'gzip, deflate, br'
headers['accept-language'] = 'en-US,en;q=0.9,zh-HK;q=0.8,zh;q=0.7,zh-CN;q=0.6,an;q=0.5'
headers['cookie'] = 'pgv_info=ssid=s9739254340; pgv_pvid=9743767040; ts_uid=2023229671; pac_uid=0_da940c972d7c0; h_uid=h592060229584922854; Hm_lvt_73f18bb34ff30f1061b904f30f86c5cb=1602238779; ts_refer=www.google.com/; ts_uid=6802299874; pgv_pvi=196710400; pgv_si=s9373821952; Hm_lpvt_73f18bb34ff30f1061b904f30f86c5cb=1602767734; hisuid=[%22h592060229584922854%22]; hisuin=[null]; feature={%2295%22:1%2C%2298%22:1}; ts_last=db.house.qq.com/index.php; ad_play_index=86'
headers['dnt'] = '1'
headers['sec-ch-ua'] = '"Chromium";v="86", "\"Not\\A;Brand";v="99", "Google Chrome";v="86"'
headers['sec-ch-ua-mobile'] = '?0'
headers['sec-fetch-dest'] = 'document'
headers['sec-fetch-mode'] = 'navigate'
headers['sec-fetch-site'] = 'none'
headers['upgrade-insecure-requests'] = '1'
headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
res = requests.get(url, headers=headers)
res.encoding = res.apparent_encoding
print(soup.find('em', {'id':'search_result_num'}).text)#0, should be 3767
print('三湘印象·森林海尚城' in res.text)#False, should be True
How to solve this problem?
Thanks.
It was generated by JavaScript (not JSON). I caught this URL in the developer tools.
import requests
from bs4 import BeautifulSoup
import re
url = 'https://db.house.qq.com/index.php?mod=search&act=newsearch&city=bj&showtype=1&mod=search&city=bj'
# url = 'http://db.house.qq.com/index.php?mod=search&city=bj'
headers = {}
headers['authority'] = 'db.house.qq.com'
headers['method'] = 'GET'
headers['path'] = '/index.php?mod=search&city=bj'
headers['scheme'] = 'https'
headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
headers['accept-encoding'] = 'gzip, deflate, br'
headers['accept-language'] = 'en-US,en;q=0.9,zh-HK;q=0.8,zh;q=0.7,zh-CN;q=0.6,an;q=0.5'
headers['cookie'] = 'pgv_info=ssid=s9739254340; pgv_pvid=9743767040; ts_uid=2023229671; pac_uid=0_da940c972d7c0; h_uid=h592060229584922854; Hm_lvt_73f18bb34ff30f1061b904f30f86c5cb=1602238779; ts_refer=www.google.com/; ts_uid=6802299874; pgv_pvi=196710400; pgv_si=s9373821952; Hm_lpvt_73f18bb34ff30f1061b904f30f86c5cb=1602767734; hisuid=[%22h592060229584922854%22]; hisuin=[null]; feature={%2295%22:1%2C%2298%22:1}; ts_last=db.house.qq.com/index.php; ad_play_index=86'
headers['dnt'] = '1'
headers['sec-ch-ua'] = '"Chromium";v="86", "\"Not\\A;Brand";v="99", "Google Chrome";v="86"'
headers['sec-ch-ua-mobile'] = '?0'
headers['sec-fetch-dest'] = 'document'
headers['sec-fetch-mode'] = 'navigate'
headers['sec-fetch-site'] = 'none'
headers['upgrade-insecure-requests'] = '1'
headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
res = requests.get(url, headers=headers)
text = res.content.decode("unicode_escape")  # unescape the \u-escaped content.
print("三湘印象·森林海尚城" in text)
soup = BeautifulSoup(text, "lxml")
result = soup.find(id="search_result_page").find_all("a")[-1].text
print(re.search(r"search_result_list_num = (\d+);", result).group(1)) # use regex to find the amount of results.
Print:
True
3767
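For context, a short illustration of what the unicode_escape decode does here (the example string is made up; the real page embeds its data in JavaScript with \uXXXX escapes):
raw = b'var name = "\\u4e09\\u6e58\\u5370\\u8c61";'  # bytes containing literal \uXXXX escapes
print(raw.decode("unicode_escape"))  # var name = "三湘印象";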
I am using Scrapy for web scraping in Python, and I am writing here because I want to figure out how I can loop through all the values in a smart way.
I am trying the logic below, but it does not seem to work as I want it to...
products = category.css("div.offer-category__body > div.offer-category__item")
for i in lengthproduct:
    img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
Do you have any good tips of how I should loop through the elements and pick up all the values I want?
I am trying to webscrape this website:
https://www.ica.se/butiker/nara/bjurholm/ica-nara-westmans-livs-231/butikserbjudanden/
But if you want to try the full code, you will need to put this URL: https://www.ica.se/butiker/ into a text file and name the file "URLs To be Scrapped.txt" (the filename the code opens).
The full code is attached below:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_splash import SplashRequest
import csv

with open("URLs To be Scrapped.txt") as f:
    URLs = f.readlines()

class Playbook(scrapy.Spider):
    name = "PostcodesSpider"

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'DataFinal.csv',
    }

    script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        splash:go(args.url)
        splash:wait(2)
        splash:set_viewport_full()
        return splash:html()
    end
    """

    def start_requests(self):
        for url in URLs:
            yield SplashRequest(url=url.strip(),
                                callback=self.parse, dont_filter=True,
                                headers={
                                    'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                  "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                },
                                endpoint="execute", args={"lua_source": self.script, }
                                )

    def parse(self, response):
        stores = response.css("div.store-card-content > div:nth-of-type(2) > a:last-of-type::attr(href)").extract()
        for store in stores:
            yield scrapy.Request(url="https://www.ica.se/" + store.strip(),
                                 callback=self.parse2, dont_filter=True,
                                 headers={
                                     'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                                   "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
                                 }, )

    def parse2(self, response):
        storeName = response.css("h1.store-heading::text").extract_first()
        categories = response.css("section.offer-category")
        for category in categories:
            categoryName = category.css("header.offer-category__header::text").extract_first()
            products = category.css("div.offer-category__body > div.offer-category__item")
            print("test")
            lengthproduct = len(products)
            print(lengthproduct)
            for i in lengthproduct:
                img = products(i).css("div.offer-type__image > img::attr(data-original)").extract_first()
                yield {
                    "Store": storeName.strip(),
                    "Category": categoryName.strip(),
                    "Image": img,
                }

process = CrawlerProcess()
process.crawl(Playbook)
process.start()
If I understand your code correctly, you're trying to loop over an integer with the line for i in lengthproduct:, which will not work, since for loops require iterables. To write a for loop that yields values from 0 to lengthproduct, you can use the range function.
However, in this case I think you just want to iterate over every found product.
You can do that as follows:
for product in products:
    img = product.css("div.offer-type__image > img::attr(data-original)").get()
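If an index is genuinely needed (for example to pair each product with its position), a range-based loop would also work; note that a SelectorList is indexed with square brackets rather than called like a function. A small sketch of the same idea:
for i in range(len(products)):
    img = products[i].css("div.offer-type__image > img::attr(data-original)").get()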