How to download image captcha using requests.Session from specific url - python

Hi everybody, I'm trying to get the image captcha on a website in order to scrape it. My problem is that the URL for the captcha image contains a parameter and I can't find where it comes from. I tried getting it with parser.xpath, but it doesn't work. This is my code:
import requests, io, re
from PIL import Image
from lxml import html
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
}
session = requests.Session()
login_url = 'https://www.sat.gob.pe/WebSiteV8/popupv2.aspx?t=6'
login_form_res = session.get(login_url, headers=headers)
myhtml = login_form_res.text
evalu = ''
for match in re.finditer(r'(mysession=)(.*?)(")', myhtml):
    evalu = myhtml[match.start():match.end()]
evalu = evalu.replace("mysession=", "")
evalu = evalu.replace('"', '')
print(evalu)
url_infractions = 'https://www.sat.gob.pe/VirtualSAT/modulos/RecordConductor.aspx?mysession=' + evalu
login_form_res = session.get(url_infractions, headers=headers)
myhtml = login_form_res.text
parser = html.fromstring(login_form_res.text)
idPic = parser.xpath('//img[@class="captcha_class"]/@src')
urlPic = "https://www.sat.gob.pe/VirtualSAT" + idPic[0].replace("..","")
print(urlPic)
image_content = session.get(urlPic, headers=headers)
image_file = io.BytesIO(image_content)
image = Image.open(image_file).convert('RGB').content
image.show()
As a result, I get an exception that says TypeError: a bytes-like object is required, not 'Response'. I'm confused. I'd really appreciate your help. Thanks in advance.
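A likely fix for that specific error (a sketch only, assuming the captcha URL itself is built correctly): session.get() returns a Response object, so pass its .content bytes to io.BytesIO, and drop the stray .content call on the PIL image:
image_response = session.get(urlPic, headers=headers)
image_file = io.BytesIO(image_response.content)  # .content holds the raw bytes of the response
image = Image.open(image_file).convert('RGB')    # a PIL Image has no .content attribute
image.show()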

Related

Can't retrieve an email from a webpage using the requests module

I'm trying to fetch an email address from a webpage using the requests module. The problem is that the email address seems to be encoded somehow, which makes it unreadable, and I want to decode it into its usual form.
import requests
from bs4 import BeautifulSoup
link = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}
res = requests.get(link,headers=headers)
soup = BeautifulSoup(res.text,"html.parser")
email = soup.select_one("script[type='text/javascript']:-soup-contains('emailProtector')").contents[0]
print(email)
When I run the above script, the following is what I get:
emailProtector.addCloakedMailto("ep_586c4771", 1);
This is the result I'm after:
fttextilegroup2017@gmail.com
You can try:
import re
import requests
from bs4 import BeautifulSoup
url = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'
def decloak(cloaked_tag, attr_name):
    a, b = "", ""
    for span in cloaked_tag.select('span'):
        for attr in span.attrs:
            if attr == attr_name:
                a += span[attr]
            else:
                b = span[attr] + b
    return a + b
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
attr_name = re.search(r'nodeName\.toLowerCase\(\)\.indexOf\("(.*?)"', str(soup)).group(1)
mail = decloak(soup.select_one('.cloaked_email'), attr_name)
print(mail)
Prints:
fttextilegroup2017@gmail.com
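For context, here is a purely hypothetical cloaked snippet run through the decloak function above (the site's real markup and attribute names may differ): spans carrying the marker attribute are appended in order, the others are prepended in reverse, which reassembles the address:
from bs4 import BeautifulSoup

# Hypothetical markup for illustration only; data-fwd/data-rev are made-up attribute names.
demo_html = '''<span class="cloaked_email">
  <span data-fwd="fttextile">.</span>
  <span data-rev="gmail.com">.</span>
  <span data-fwd="group2017@">.</span>
</span>'''

demo = BeautifulSoup(demo_html, 'html.parser')
print(decloak(demo.select_one('.cloaked_email'), 'data-fwd'))  # fttextilegroup2017@gmail.com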

Data are overwritten in pandas

When I make the CSV file, the data get overwritten in it. If there is any solution, please provide it. The link of the page is https://www.aeafa.es/asociados.php?provinput=&_pagi_pg=1. I have already searched for an answer here and spent a long time on Google, but nothing... I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to work.
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
for page in range(1, 3):
    r = requests.get('https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}'.format(page=page),
                     headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tag = soup.find_all('div', class_='col-md-8 col-sm-8')
    temp = []
    for pro in tag:
        data = [tup.text for tup in pro.find_all('p')]
        Dirección = data[2]
        Dirección = Dirección[12:]
        Población = data[3]
        Población = Población[14:]
        Provincia = data[4]
        Provincia = Provincia[14:]
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace('.', "")
        Email = data[6]
        Email = Email[10:]
        temp.append([Dirección, Provincia, Población, Teléfono, Email])
    df = pd.DataFrame(temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"])
    df.to_csv('samp.csv')
Try to put the list temp outside of the for-loop. Then, create the dataframe after all the loops finish:
import requests
import pandas as pd
from bs4 import BeautifulSoup
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

temp = []
for page in range(1, 3):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        Dirección = data[2]
        Dirección = Dirección[12:]
        Población = data[3]
        Población = Población[14:]
        Provincia = data[4]
        Provincia = Provincia[14:]
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Email = data[6]
        Email = Email[10:]
        temp.append([Dirección, Provincia, Población, Teléfono, Email])

df = pd.DataFrame(
    temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"]
)
df.to_csv("samp.csv")
print(len(df))
Prints:
98
Screenshot from LibreOffice:
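As a side note on the 'w' vs 'a' confusion from the question: if you prefer writing the CSV page by page instead of collecting everything first, a rough sketch (assuming temp holds only the current page's rows) is to append with mode='a' and write the header only when the file doesn't exist yet:
import os

out_path = "samp.csv"
df_page = pd.DataFrame(temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"])
# Append each page; emit the header row only on the first write.
df_page.to_csv(out_path, mode="a", header=not os.path.exists(out_path), index=False)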

How to grab image links correctly? My scraper only makes blank folders

My code only makes empty folders and does not download any images.
So I think it needs to be modified so that the images are actually downloaded.
I tried to fix it myself but can't figure out how.
Can anyone please help me? Thank you!
import requests
import parsel
import os
import time
for page in range(1, 310):  # Total 309 pages
    print(f'======= Scraping data from page {page} =======')
    url = f'https://www.bikeexif.com/page/{page}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    html_data = response.text
    selector = parsel.Selector(html_data)
    containers = selector.xpath('//div[@class="container"]/div/article[@class="smallhalf"]')
    for v in containers:
        old_title = v.xpath('.//div[2]/h2/a/text()').get()  # .replace(':', ' -')
        if old_title is not None:
            title = old_title.replace(':', ' -')
            title_url = v.xpath('.//div[2]/h2/a/@href').get()
            print(title, title_url)
            if not os.path.exists('img\\' + title):
                os.mkdir('img\\' + title)
            response_image = requests.get(url=title_url, headers=headers).text
            selector_image = parsel.Selector(response_image)
            # Full Size Images
            images_url = selector_image.xpath('//div[@class="image-context"]/a[@class="download"]/@href').getall()
            for title_url in images_url:
                image_data = requests.get(url=title_url, headers=headers).content
                file_name = title_url.split('/')[-1]
                time.sleep(1)
                with open(f'img\\{title}\\' + file_name, mode='wb') as f:
                    f.write(image_data)
                print('Download complete!!:', file_name)
This page uses JavaScript to create the "download" links, but requests/urllib/beautifulsoup/lxml/parsel/scrapy can't run JavaScript, and this causes the problem.
However, it seems the page uses the same URLs to display the images on the page, so you may use //img/@src.
That creates another problem, because the page uses JavaScript to "lazy load" the images and only the first img has src. The other images have their URL in data-src (normally JavaScript copies data-src to src when you scroll the page), so you have to read data-src to download the rest of the images.
You need something like this to get @src (for the first image) and @data-src (for the other images):
images_url = selector_image.xpath('//div[@id="content"]//img/@src').getall() + \
             selector_image.xpath('//div[@id="content"]//img/@data-src').getall()
Full working code (with other small changes):
Because I use Linux, the string img\\{title} creates a wrong path for me, so I use os.path.join('img', title, filename), which builds the correct path on Windows, Linux, and Mac.
import requests
import parsel
import os
import time
# you can define it once
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
for page in range(1, 310):  # Total 309 pages
    print(f'======= Scraping data from page {page} =======')
    url = f'https://www.bikeexif.com/page/{page}'
    response = requests.get(url, headers=headers)
    selector = parsel.Selector(response.text)
    containers = selector.xpath('//div[@class="container"]/div/article[@class="smallhalf"]')
    for v in containers:
        old_title = v.xpath('.//div[2]/h2/a/text()').get()  # .replace(':', ' -')
        if old_title is not None:
            title = old_title.replace(':', ' -')
            title_url = v.xpath('.//div[2]/h2/a/@href').get()
            print(title, title_url)
            os.makedirs(os.path.join('img', title), exist_ok=True)  # creates it only if it doesn't exist
            response_article = requests.get(url=title_url, headers=headers)
            selector_article = parsel.Selector(response_article.text)
            # Full Size Images
            images_url = selector_article.xpath('//div[@id="content"]//img/@src').getall() + \
                         selector_article.xpath('//div[@id="content"]//img/@data-src').getall()
            print('len(images_url):', len(images_url))
            for img_url in images_url:
                response_image = requests.get(url=img_url, headers=headers)
                filename = img_url.split('/')[-1]
                with open(os.path.join('img', title, filename), 'wb') as f:
                    f.write(response_image.content)
                print('Download complete!!:', filename)
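A possible refinement (a sketch, not verified against this site): take data-src when it is present and fall back to src for each img, so placeholder thumbnails and the real lazy-loaded URLs don't end up mixed in one list:
# Prefer the lazy-loaded data-src, fall back to src, skip images with neither.
images_url = []
for img in selector_article.xpath('//div[@id="content"]//img'):
    u = img.xpath('./@data-src').get() or img.xpath('./@src').get()
    if u:
        images_url.append(u)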

What is the fix for this Error: 'NoneType' object has no attribute 'prettify'

I want to scrape this URL https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of this error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://aviation-safety.net/wikibase/type/C206'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
    if soup.select_one('div.pagenumbers + div a[href]'):
        url = soup.select_one('div.pagenumbers + div a')['href']
    else:
        break

df = pd.concat(data)
df.to_csv('206.csv', encoding='utf-8-sig', index=False)
You're not using headers with requests, which is the reason you're not getting the right HTML, and the table you're after is the second one, not the first. Also, I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}

data = []
with requests.Session() as s:
    total_pages = int(
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")[-1]
        .getText()
    )
    for page in range(1, total_pages + 1):
        print(f"Getting page: {page}...")
        data.append(
            pd.read_html(
                s.get(f"{url}/{page}", headers=headers).text,
                flavor="lxml",
            )[1]
        )

df = pd.concat(data)
df.to_csv('206.csv', sep=";", index=False)
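If the position of the table on the page ever changes, pandas.read_html also accepts a match= argument (a string or regex the wanted table must contain), so you could select it by content instead of by index. A hedged sketch; the "operator" text is only an assumed marker for this table:
tables = pd.read_html(
    s.get(f"{url}/{page}", headers=headers).text,
    match="operator",  # hypothetical text expected to appear only in the wanted table
    flavor="lxml",
)
data.append(tables[0])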

Retrieving Lyrics from Musixmatch

import requests
import json
import urllib
import lyricsgenius
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.dbsparta
def get_artist_id(artistName):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/artist.search?page_size=100&format=json&apikey=123&q_artist=" + artistName, headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['artist_list'][0]['artist']['artist_id']
    # print(response.json()['message']['body']['artist_list'][0]['artist']['artist_id'])
def get_album_ids(artist_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    album_response = requests.get("https://api.musixmatch.com/ws/1.1/artist.albums.get?page_size=100&format=json&apikey=123&artist_id=" + str(artist_id), headers=headers)
    album_response.encoding = 'UTF-8'
    # counter = 0
    # album_list = album_response.json()['message']['body']['album_list']
    return album_response.json()['message']['body']['album_list']
    # print(album_response.json()['message']['body']['album_list'])
    # for album in album_list:
    #     # counter += 1
    #     print(album['album']['album_id'])
def get_album_tracks_ids(album_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/album.tracks.get?page_size=100&format=json&apikey=123&album_id=" + str(album_id), headers=headers)
    response.encoding = 'UTF-8'
    return response.json()['message']['body']['track_list']
# def get_track_id(artist_id):
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# response = requests.get("https://api.musixmatch.com/ws/1.1/track.search?page_size=100format=json&apikey=123&f_artist_id=" + str(artist_id), headers=headers)
# response.encoding = 'UTF-8'
# for tracks in response.json()['message']['body']['track_list']:
# print(tracks['track']['track_name'])
def get_track_lyrics(track_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get("https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id), headers=headers)
    response.encoding = 'UTF-8'
    # return response['message']['body']['lyrics']['lyrics_body']
    return response.json()['message']['body']['lyrics']['lyrics_body']
def main():
    stars_list = list(db.new_top200.find({}, {'_id': 0}))
    for stars in stars_list:
        print(stars['name'])
        album_ids = get_album_ids(get_artist_id(stars['name']))
        # if album_ids is not None:
        for album_id in album_ids:
            # if album_id is not None and get_album_tracks_ids(album_id['album']['album_id']) is not [] and get_album_tracks_ids(album_id['album']['album_id']) is not None:
            track_ids = get_album_tracks_ids(album_id['album']['album_id'])
            for track in track_ids:
                # if track is not [] and track['track']['track_id'] is not [] and track is not None:
                # if get_track_lyrics(track['track']['track_id']) is not [] and get_track_lyrics(track['track']['track_id']) is not None:
                lyric = get_track_lyrics(track['track']['track_id'])
                db.new_top200.update_one({'name': stars['name']}, {'$push': {'lyrics': lyric}})
# get_track_id(get_artist_id('Kanye West'))
# get_album_ids(get_artist_id("Kanye West"))
# get_album_tracks(15565713)
if __name__ == "__main__":
# for album in get_album_ids(get_artist_id("Kanye West")):
# get_album_tracks_ids(album['album']['album_id'])
# get_track_lyrics(96610952)
# get_album_tracks_ids(15565713)
# get_album_ids(get_artist_id('Drake'))
main()
I'm trying to get ALL of the lyrics of an artist and store them in a database. For example, if the artist is "Drake", I want all of his lyrics stored under the 'lyrics' key in my database.
However, I get a bunch of unpredictable errors every time I run the same code. For example, it will insert 400 lyrics without any problem and then suddenly raise an error saying 'list indices must be integers or slices, not str'. This error is quite confusing to me because I'm assuming all of the JSON data are in the same format, yet the error appears only after processing 400 song lyrics without a problem.
Other times I can run the same code and, about 200 song lyrics in, I'll get a JSON decode error; then, when I run it AGAIN, after processing a different number of song lyrics, I'll get the error I described at the beginning again.
Can someone explain the random nature of this error?
Thank you!
You are making assumptions about the data types that will be returned from the JSON. In your case I suspect that one of the json elements is a list not an object.
Your issue can be reproduced with this simple example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}
print(my_dict['message']['body']['lyrics']['lyrics_body'])
gives:
TypeError: list indices must be integers or slices, not str
How do you fix it? You'll need to check each element matches what you expect; for example:
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

def checker(item, field):
    if isinstance(item, dict):
        return item.get(field)
    else:
        raise ValueError(f"'{item}' in field '{field}' is not a valid dict")
message = checker(my_dict, 'message')
body = checker(message, 'body')
lyrics = checker(body, 'lyrics')
print(checker(lyrics, 'lyrics'))
gives:
ValueError: '['Always look on the bright side of life']' in field 'lyrics' is not a valid dict
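Applied to the Musixmatch calls above, a defensive sketch (the field names follow the question's code; the exact shape of an error response is an assumption) would return None for an unexpected body instead of raising, so main() can simply skip that track:
def safe_lyrics_body(response):
    # Walk message -> body -> lyrics, bailing out as soon as the shape is unexpected.
    body = response.json().get('message', {}).get('body')
    if not isinstance(body, dict):   # e.g. an empty list when the API reports an error
        return None
    lyrics = body.get('lyrics')
    if not isinstance(lyrics, dict):
        return None
    return lyrics.get('lyrics_body')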
