I want to save some images from a website using Python urllib2, but when I run the code it saves something else.
This is my code:
import urllib2
from bs4 import BeautifulSoup

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
url = "http://m.jaaar.com/"
r = urllib2.Request(url, headers=headers)
page = urllib2.urlopen(r).read()
soup = BeautifulSoup(page)
imgTags = soup.findAll('img')
imgTags = imgTags[1:]
for imgTag in imgTags:
    imgUrl = "http://www.jaaar.com" + imgTag['src']
    imgUrl = imgUrl[0:-10] + imgUrl[-4:]
    fileName = "khabarnak-" + imgUrl[-12:]
    print fileName
    imgData = urllib2.urlopen(imgUrl).read()
    print imgUrl
    output = open("C:\\wamp\\www\\py\\pishkhan\\" + fileName, 'wb')
    output.write(imgData)
    output.close()
Any suggestions?
The site is returning a standard image back to you because you are scraping the site. Use the same 'trick' of setting the headers when retrieving the image:
imgRequest = urllib2.Request(imgUrl, headers=headers)
imgData = urllib2.urlopen(imgRequest).read()
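Folded back into the loop from the question, the download step would look roughly like this (a minimal sketch; the URL slicing and the output path are kept from the question and are assumptions about the site's layout):

for imgTag in imgTags:
    imgUrl = "http://www.jaaar.com" + imgTag['src']
    imgUrl = imgUrl[0:-10] + imgUrl[-4:]
    fileName = "khabarnak-" + imgUrl[-12:]
    # build the image request with the same User-Agent header as the page request
    imgRequest = urllib2.Request(imgUrl, headers=headers)
    imgData = urllib2.urlopen(imgRequest).read()
    output = open("C:\\wamp\\www\\py\\pishkhan\\" + fileName, 'wb')
    output.write(imgData)
    output.close()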
I am trying to explore web scraping in Python. Currently I am working with Beautiful Soup. I was trying to get the names of the festivals from this site: https://www.skiddle.com/festivals. Everything was going pretty fine, except for one page, this one: https://www.skiddle.com/festivals/front-end-data-test/. It says 'NoneType' object has no attribute 'find'. Is there any way I can get the data from there?
Here is the code
import requests
from bs4 import BeautifulSoup
import lxml
import json

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64"
}

# collect all fests URLs
fests_urls_list = []

# for i in range(0, 120, 24):
for i in range(0, 24, 24):
    url = f"https://www.skiddle.com/festivals/search/?ajaxing=1&sort=0&fest_name=&from_date=15%20Aug%202022&to_date=&maxprice=500&o={i}&bannertitle=August"
    req = requests.get(url=url, headers=headers)
    json_data = json.loads(req.text)
    html_response = json_data["html"]

    with open(f"data/index_{i}.html", "w", encoding="utf-8") as file:
        file.write(html_response)

    with open(f"data/index_{i}.html", "r", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")
    cards = soup.find_all("a", class_="card-details-link")

    for item in cards:
        fest_url = "https://www.skiddle.com" + item.get("href")
        fests_urls_list.append(fest_url)

# collect fest info
for url in fests_urls_list:
    req = requests.get(url=url, headers=headers)
    try:
        soup = BeautifulSoup(req.text, "lxml")
        fest_name = soup.find("div", class_="MuiContainer-root MuiContainer-maxWidthFalse css-1krljt2").find("h1").text.strip()
        fest_data = soup.find("div", class_="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-11 css-twt0ol").text.strip()
        print(fest_data)
    except Exception as ex:
        print(ex)
        print("This was not supposed to happen")
For a project I am trying to automate the uploading of plugins into WordPress. I am successfully logging in using requests, but when attempting to upload a plugin I get a 403.
import requests
import re
import bs4 as bs

host = 'testdomain.xx'
user = 'user'
pword = 'pass'
wp_admin = 'http://' + host + '/wp-admin/'
wp_login = 'http://' + host + '/wp-login.php'
wpplugin = 'http://' + host + '/wp-admin/update.php?action=upload-plugin'

with requests.Session() as session:
    headers1 = {'Cookie': 'wordpress_test_cookie=WP Cookie check',
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7'}
    datas = {'log': user, 'pwd': pword, 'wp-submit': 'Log In',
             'redirect_to': wp_admin, 'testcookie': '1'}
    session.post(wp_login, headers=headers1, data=datas)

    session_cookie = session.cookies
    cookie_dict = session_cookie.get_dict()

    wp = session.get('http://' + host + '/wp-admin/plugins.php')
    soup = bs.BeautifulSoup(wp.text, 'html.parser')
    inputsoup = soup.find_all('input', {"id": "_wpnonce"})
    inputsoup = str(inputsoup).split()
    for i in inputsoup:
        if 'value' in i:
            wp_nonce = i.replace('value="', '').replace('"/>]', '')

    url = 'http://' + host + '/wp-admin/update.php?action=upload-plugin'
    data = {
        '_wp_http_referer': "/wordpress/wp-admin/plugin-install.php",
        '_wpnonce': wp_nonce,
        'install-plugin-submit': 'Install Now',
        'pluginzip': ('plugin.zip', open('./plugin.zip', 'rb'),
                      'application/octet-stream')
    }
    upload_result = session.post(url, data=data)
    print(upload_result)
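One thing worth noting: requests only builds a multipart/form-data body when the file tuple is passed through the files argument; passed through data= it is form-encoded instead, which the WordPress uploader may reject. A minimal sketch of how the upload request is usually assembled, inside the same with requests.Session() as session: block (field names are taken from the question and are assumptions about the target install):

    # form fields go in data=, the (filename, fileobj, content_type) tuple goes in files=
    form_fields = {
        '_wp_http_referer': '/wordpress/wp-admin/plugin-install.php',
        '_wpnonce': wp_nonce,
        'install-plugin-submit': 'Install Now',
    }
    files = {
        'pluginzip': ('plugin.zip', open('./plugin.zip', 'rb'),
                      'application/octet-stream'),
    }
    upload_result = session.post(url, data=form_fields, files=files)
    print(upload_result.status_code)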
I am trying to download images from a web page using BeautifulSoup. I am getting the following error
MissingSchema: Invalid URL
import requests
from bs4 import BeautifulSoup
import os
from os.path import basename
from urllib.request import urlopen

url = "https://xxxxxx"
#r = requests.get(url)
request_page = urlopen(url)
page_html = request_page.read()
request_page.close()

soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)

images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = image['src']
    with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
        im = requests.get(link)
        f.write(im.content)

print(images)
I am unsure why. I know I can read the images fine, because the print works fine until I added the following code:
with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
    im = requests.get(link)
    f.write(im.content)
I would be grateful for any help
thanks
EDIT
The url is
url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2018"
I added the print of the link as requested and the output is below:
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg/280px-Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Bee_on_Lavender_Blossom_2.jpg/250px-Bee_on_Lavender_Blossom_2.jpg
edit
I am just wondering if it is the size of the name in the link? Looking at it, the image seems to be buried in a lot of folders before we get to the jpeg.
As I suspected based on the error, the print statement you added shows that the links you are trying to access are not valid URLs.
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg needs to start with https:.
To fix this, simply prepend that to image['src'].
The second issue you need to fix is that when you write the file, you are writing it as 'Natalya-Naryshkinajpg'. You need jpg as the file extension, for example 'Natalya-Naryshkina.jpg'. I fixed that as well.
Code:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2019"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
r = requests.get(url, headers=headers)
page_html = r.text

soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)

images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = 'https:' + image['src']
    #print(link)
    if 'static' not in link:
        try:
            extension = link.split('.')[-1]
            with open(name.replace(' ', '-').replace('/', '') + '.' + extension, 'wb') as f:
                im = requests.get(link, headers=headers)
                f.write(im.content)
                print(name)
        except Exception as e:
            print(e)

print(images)
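An alternative to prepending the scheme by hand is urllib.parse.urljoin, which resolves both protocol-relative and relative src values against the page URL; a minimal sketch reusing the soup and url variables from above:

from urllib.parse import urljoin

for image in soup.find_all('img'):
    # urljoin turns "//upload.wikimedia.org/..." into "https://upload.wikimedia.org/..."
    link = urljoin(url, image['src'])
    print(link)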
This should hopefully work:
import re
import requests
from bs4 import BeautifulSoup

site = 'https://books.toscrape.com/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
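As a design note, the regex above only accepts jpg/gif/png names; pulling the filename out of the URL with urllib.parse works for any extension. A minimal sketch reusing the site and urls variables from above:

import os
from urllib.parse import urljoin, urlsplit

for url in urls:
    full_url = urljoin(site, url)                         # resolve relative src values
    filename = os.path.basename(urlsplit(full_url).path)  # last path segment, query string stripped
    if filename:
        with open(filename, 'wb') as f:
            f.write(requests.get(full_url).content)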
I am trying to query images with a copyright-free filter. Even though the URL leads to the right settings in my code, for some reason the page that is read by both urllib and requests contains only the first few images, without the copyright-free and size filters applied. If anyone can help with this I would greatly appreciate it.
code:
#%%
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
url = 'https://google.com/search?q='
input = 'cat'
#string: tbm=isch --> means image search
#tbs=isz:m --> size medium
#il:cl --> copy right free(i think)
url = url+input+'&tbm=isch&tbs=isz:m%2Cil:cl'
print(url)
html = urlopen(Request(url, headers={'User-Agent': 'Google Chrome'}))
'''with urllib.request.urlopen(url) as response:
    html = response.read()
    print(html)'''
#print(str(r.content))
soup = BeautifulSoup(html.read(),'html.parser')
#print(soup.prettify)
#using soup to find all img tags
results = soup.find_all('img')
str_result = str(results)
print(str_result)
lst_result = str_result.split(',')
#trying to get the very first link for the images with the appropriate settings
link = lst_result[4].split(' ')[4].split('"')[1]
#print(link)
# writing into the appropriate testing file, to be changed
file = open('.img1.png','wb')
get_img = requests.get(link)
file.write(get_img.content)
file.close()
import requests
import re, json

extentions = ['jpg', 'jpeg', 'png', 'gif', 'svg']

# determine image extention (not guaranteed, some links lack the extension)
def extention(url):
    # or use the "imghdr" package to determine the extention
    for ext in extentions:
        if url.endswith(f'.{ext}'):
            return ext
    return 'UNKNOWN'

URL = 'https://google.com/search'
params = {
    'q': 'cat',  # search term
    'tbm': 'isch',
    'tbs': 'isz:m,il:cl'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.3'
}

r = requests.get(URL, params=params, headers=headers)
html = r.text

matches = re.findall(r'data:\[(?!\])(.*?)],\ sideChannel', html)  # the data lives in a script, not in <img> elements (initially)
data = json.loads(f'[{matches[1]}]')

images = []
for image in data[31][0][12][2]:  # the data structure may change some day, but it's consistent between requests as of now
    if type(image) is list:
        try:
            images.append(image[1][3][0])
        except:
            pass
images = list(dict.fromkeys(images))  # remove duplicate links

# retrieve and save the first image's data
print(images[0])
imgdata = requests.get(images[0], headers=headers).content
with open(f'img.{extention(images[0])}', 'wb') as file:
    file.write(imgdata)
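A possible follow-up, reusing the images list, headers, and extention helper from the snippet above, to save the first few results instead of just one:

for i, link in enumerate(images[:5]):
    imgdata = requests.get(link, headers=headers).content
    with open(f'img_{i}.{extention(link)}', 'wb') as file:
        file.write(imgdata)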
I am trying to get data, using requests web scraping, from this web site: https://enlinea.sunedu.gob.pe/verificainscripcion. The parameters are a doc (06950413 in the example), the captcha, and a hidden parameter called _token, which I get using XPath. For the captcha I also get the image using XPath and download it into an imagenes folder. After that I wait at an input() call while I type the captcha letters into captcha.txt; then I hit enter to continue, but I get a JSON response with a captcha error. This is my code:
from time import sleep
import requests
from lxml import html
from PIL import Image  # pip install Pillow
import io

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "Host": "enlinea.sunedu.gob.pe",
    "Origin": "https://enlinea.sunedu.gob.pe",
    "Referer": "https://enlinea.sunedu.gob.pe/verificainscripcion",
}

session = requests.Session()
login_form_url = 'https://enlinea.sunedu.gob.pe/verificainscripcion'
login_form_res = session.get(login_form_url, headers=headers)
sleep(5)

parser = html.fromstring(login_form_res.text)
special_token = parser.xpath('//input[@name="_token"]/@value')
print('token:', special_token[0])

span_image = parser.xpath('//div[@class="pull-right"]/span[@id="captchaImgPriv"]/img')[0].get("src")
print(span_image)
image_content = requests.get(span_image).content
image_file = io.BytesIO(image_content)
image = Image.open(image_file).convert('RGB')
file_path = './imagenes/captcha.jpg'
with open(file_path, 'wb') as f:
    image.save(f, "JPEG", quality=85)

input()

login_url = 'https://enlinea.sunedu.gob.pe/consulta'
login_data = {
    "doc": "06950413",
    "opcion": 'PUB',
    "_token": special_token[0],
    "icono": '',
    "captcha": open('captcha.txt').readline().strip()
}
print(login_data)
rep = session.post(
    login_url,
    data=login_data,
    headers=headers
)
print(rep.text)
Thanks in advance.
The issue was that you didn't use the session when you built the captcha image request. Requesting the image gives a cookie that should be sent back with the form.
The following script uses BeautifulSoup instead of XPath/lxml; it downloads the captcha, shows it, waits for user input and gets the data:
import requests
from bs4 import BeautifulSoup
from PIL import Image
import io
import json
host = "https://enlinea.sunedu.gob.pe"
doc = "06950413"
s = requests.Session()
r = s.get(f"{host}/verificainscripcion")
soup = BeautifulSoup(r.text, "html.parser")
payload = dict([
    (t["name"], t.get("value", ""))
    for t in soup.find("form", {"id": "consultaForm"}).find_all("input")
])
payload["doc"] = doc
image_content = s.get(f'{host}/simplecaptcha').content  # use the same session so the captcha cookie is kept
image_file = io.BytesIO(image_content)
image = Image.open(image_file).convert('RGB')
image.show()
captcha = input("please enter captcha : ")
payload["captcha"] = captcha
print(payload)
r = s.post("https://enlinea.sunedu.gob.pe/consulta", data = payload)
data = json.loads(r.text)
print(data)
output:
please enter captcha : YLthP
{'doc': '06950413', 'opcion': 'PUB', '_token': 'gfgUTJrqPcmM9lyFHqW0u5aOdoF4gNSJm60kUNRu', 'icono': '', 'nombre': '', 'captcha': 'YLthP'}
[{"ID":"1519729","NOMBRE":"OSORIO DELGADILLO, FLOR DE MARIA","DOC_IDENT":"DNI 06950413","GRADO":"<b>BACHILLER EN EDUCACION ","TITULO_REV":"<b>BACHILLER EN EDUCACION","GRADO_REV":null,"DIPL_FEC":"26\/06\/1987","RESO_FEC":"-","ESSUNEDU":"0","UNIV":"UNIVERSIDAD INCA GARCILASO DE LA VEGA ASOCIACI\u00d3N CIVIL","PAIS":"PERU","COMENTARIO":"-","TIPO":"N","TIPO_GRADO":"B","DIPL_TIP_EMI":null,"TIPO_INSCRI":null,"NUM_DIPL_REVA":null,"NUM_ORD_PAG":null,"V_ORIGEN":null,"NRO_RESOLUCION_NULIDAD":null,"FLG_RESOLUCION_NULIDAD":null,"FECHA_RESOLUCION_NULIDAD":null,"MODALIDAD_ESTUDIO":"-"}]