Scraped images are corrupt - python

Hi, I'm trying to scrape the front-page images on digg.com with the following code. The issue is that 0.jpg through 6.jpg are normal, but 7.jpg through 47.jpg are corrupt. Not sure why.
Here is the code. GitHub repo here: https://github.com/kenpeter/py_mm
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    img_urls = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)
    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');

    page_number = 4  # hardcode
    get_page_number(page_number)

The reason the images are "corrupt" is that the markup changes partway through the page: the real image URLs move into the data-src attribute (lazy loading), while src, whose content you grab with your code, only holds a placeholder. Here is an example from the source of the scraped page showing both attributes:
<img
    class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
    data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
    src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
    width="312"
    height="170"
    alt=""
/>
In other words, you have to check both attributes, src and data-src, giving data-src priority over src when building the list of image URLs.
This code does the trick and downloads the proper images:
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    img_urls_1a = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    img_urls_1b = [item for item in img_urls_1a if 'x_455x248.png' not in item]
    img_urls_2 = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@data-src")
    img_urls = img_urls_1b + img_urls_2
    # print(img_urls)

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)
    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');

    page_number = 4  # hardcode
    get_page_number(page_number)
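If you also want to keep each image aligned with its description text, an alternative is to pick data-src over src per img element instead of concatenating two lists. A minimal sketch of that idea, reusing the selector and placeholder check from above (untested against the live page):

import requests
from lxml import html

def collect_image_urls(page_url='http://digg.com'):
    tree = html.fromstring(requests.get(page_url).content)
    urls = []
    for img in tree.xpath("//div[@class='digg-story__image--thumb']/a/img"):
        # data-src holds the real image when lazy loading is used; src may only be a placeholder
        src = img.get('data-src') or img.get('src')
        if src and 'x_455x248.png' not in src:
            urls.append(src)
    return urls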

Related

Automatic Image Download From Google Lens

I need images similar to my own pictures, so I'm trying to automatically download images from Google Lens results (tutorial example link).
The images in the Google Images section were downloaded, but I could not download the images from Google Lens. I would love any help. Which parts of the code should I change?
import bs4
import requests
from selenium import webdriver
import os
import time
from selenium.webdriver.common.by import By

# creating a directory to save images
folder_name = 'images/soup'
if not os.path.isdir(folder_name):
    os.makedirs(folder_name)


def download_image(url, folder_name, num):
    # write image to file
    reponse = requests.get(url)
    if reponse.status_code == 200:
        with open(os.path.join(folder_name, str(num) + ".jpg"), 'wb') as file:
            file.write(reponse.content)


from selenium.webdriver.chrome.service import Service

chromePath = 'C:/Users/A/Downloads/chromedriver_win322/chromedriver.exe'
driver = webdriver.Chrome(chromePath)
# s = Service('C:/Users/ASUS/Downloads/chromedriver_win322/chromedriver.exe')
# driver = webdriver.Chrome(service=s)

search_URL = "https://www.google.com/search?q=sulu+k%C3%B6fte&tbm=isch&ved=2ahUKEwiNiqr85YD9AhXcwwIHHT59D74Q2-cCegQIABAA&oq=sulu+k&gs_lcp=CgNpbWcQARgAMggIABCABBCxAzIICAAQgAQQsQMyBQgAEIAEMgsIABCABBCxAxCDATIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDoECAAQQzoICAAQsQMQgwE6BwgAELEDEEM6BQgAELEDUI8IWL4bYL8kaAFwAHgAgAG_AYgBpAiSAQMwLjiYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=FeXgY82rG9yHi-gPvvq98As&bih=722&biw=1519&hl=tr"
driver.get(search_URL)

# //*[@id="islrg"]/div[1]/div[1]
# //*[@id="islrg"]/div[1]/div[50]
# //*[@id="islrg"]/div[1]/div[25]
# //*[@id="islrg"]/div[1]/div[75]
# //*[@id="islrg"]/div[1]/div[350]

a = input("Waiting...")

# Scrolling all the way up
driver.execute_script("window.scrollTo(0, 0);")

page_html = driver.page_source
pageSoup = bs4.BeautifulSoup(page_html, 'html.parser')
containers = pageSoup.findAll('div', {'class': "isv-r PNCib MSM1fd BUooTd"})
print(len(containers))
len_containers = len(containers)

for i in range(1, len_containers + 1):
    if i % 25 == 0:
        continue

    xPath = """//*[@id="islrg"]/div[1]/div[%s]""" % (i)

    previewImageXPath = """//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img""" % (i)
    previewImageElement = driver.find_element_by_xpath(previewImageXPath)
    previewImageURL = previewImageElement.get_attribute("src")
    # print("preview URL", previewImageURL)
    # print(xPath)

    driver.find_element_by_xpath(xPath).click()
    # time.sleep(3)

    # //*[@id="islrg"]/div[1]/div[16]/a[1]/div[1]/img
    # input('waawgawg another wait')
    # page = driver.page_source
    # soup = bs4.BeautifulSoup(page, 'html.parser')
    # ImgTags = soup.findAll('img', {'class': 'n3VNCb', 'jsname': 'HiaYvf', 'data-noaft': '1'})
    # print("number of the ROI tags", len(ImgTags))
    # link = ImgTags[1].get('src')
    # #print(len(ImgTags))
    # #print(link)
    #
    # n=0
    # for tag in ImgTags:
    #     print(n, tag)
    #     n+=1
    # print(len(ImgTags))
    # /html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img

    # It's all about the wait
    timeStarted = time.time()

    while True:
        # driver.find_element(By.XPATH, "//*[@id='search']").click()
        # imageElement = driver.find_element(By.XPATH, '//*[@id="yDmH0d"]/div[6]/div/div[2]/div[2]/div['
        # imageElement = driver.find_element_by_xpath(
        #     """//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img""")
        # imageElement = driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img')
        imageElement = driver.find_element_by_xpath(
            """//*[@id="Sva75c"]/div[2]/div/div[2]/div[2]/div[2]/c-wiz/div[2]/div[1]/div[1]/div[2]/div/a/img""")
        # titles = driver.find_elements(By.XPATH, '//div[@id="srp-river-results"]//img[@class="s-item__image-img"]')
        # titles = driver.find_elements(By.XPATH, '//img[@class="s-item__image-img"]')
        imageURL = imageElement.get_attribute('src')

        if imageURL != previewImageURL:
            # print("actual URL", imageURL)
            break
        else:
            # making a timeout if the full res image can't be loaded
            currentTime = time.time()
            if currentTime - timeStarted > 10:
                print("Timeout! Will download a lower resolution image and move onto the next one")
                break

    # Downloading image
    try:
        download_image(imageURL, folder_name, i)
        print("Downloaded element %s out of %s total. URL: %s" % (i, len_containers + 1, imageURL))
    except:
        print("Couldn't download an image %s, continuing downloading the next one" % (i))

How can I open up many links with chromedriver that are stored in one variable?

So I wrote this code to web-scrape CNN and get articles about a specific topic:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

serch = input('What News are you looking for today? ')
serch_term = str(serch)
real_serch = f'{serch_term}'

url = f'https://edition.cnn.com/search?q={real_serch}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)

driver.get(url)
time.sleep(4)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# driver.close()

SAVED_DATA = "data.json"


def save_data(filepath, data):
    with open(filepath, "w") as f:
        json.dump(data, f)


def load_data(filepath):
    try:
        with open(filepath, "r") as f:
            data = json.load(f)
            return data
    except:
        return {}


def only_get_title():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        return (title)


def get_href():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:' + url_
        return (abs_url)


def store():
    data = load_data(SAVED_DATA)
    key = only_get_title()
    data[key] = get_href()
    save_data(SAVED_DATA, data)
    print("News saved!")


if __name__ == '__main__':
    store()
My question: abs_url is meant to hold many links to the different articles that were found on that subject on CNN. I want to go to every one of these links and save the data, but it only opens up the first link stored in abs_url and not the others. How can I open every link and save each one to my JSON file, as you can see in the code?
You run return inside the for-loop, so you exit the function at the first link.
You should add all links to a list and return that list after the for-loop:
def get_href():
    all_results = []

    # --- loop ---

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:' + url_
        all_results.append(abs_url)

    # --- after loop --

    return all_results
You have the same problem with the titles:
def only_get_title():
    all_results = []

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        all_results.append(title)

    # --- after loop --

    return all_results
Later you will need a for-loop with zip() to create (title, url) pairs:
def store():
    data = load_data(SAVED_DATA)

    all_titles = only_get_title()
    all_urls = get_href()

    for title, url in zip(all_titles, all_urls):
        data[title] = url

    save_data(SAVED_DATA, data)
    print("News saved!")
But maybe it would be simpler and more readable to get the title and URL in one function and create the pairs as you append to the list:
def get_articles():
    all_results = []

    # --- loop ---

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url = h3.get('href')
        abs_url = 'https:' + url

        pair = (title, abs_url)
        all_results.append(pair)

    # --- after loop --

    return all_results


def store():
    data = load_data(SAVED_DATA)

    all_articles = get_articles()

    for title, url in all_articles:
        data[title] = url

    save_data(SAVED_DATA, data)
    print("News saved!")
This is also safer when you want to get more details from an article, because if an article doesn't have some detail you can add None or a default value. With separate functions, one of them may skip empty elements and zip() will then create wrong pairs (tuples).
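A quick illustration, with made-up titles and URLs, of how a skipped element shifts the pairs:

# Hypothetical data: the second article had no href, so the URL function skipped it.
titles = ["Article A", "Article B", "Article C"]
urls = ["https://edition.cnn.com/a", "https://edition.cnn.com/c"]

# zip() silently pairs "Article B" with the URL that belongs to "Article C".
print(list(zip(titles, urls)))
# [('Article A', 'https://edition.cnn.com/a'), ('Article B', 'https://edition.cnn.com/c')]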

IndexError while scraping from webpage

I have been trying to scrape data from xhamster channels for my research using this code:
import json
from multiprocessing.dummy import Pool as ThreadPool

from lxml import html

from util import req


def get_channel_urls(url):
    r = req(url)
    tree = html.fromstring(r.text)
    print("Done", url)
    return [x.attrib['href'] for x in tree.xpath('//div[@class="item"]/a')]


def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    json_data = json.loads(
        tree.xpath('//script[@id="initials-script"]/text()')[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)


def main():
    letters = '0abcdefghijklmnopqrstuvqxyz'
    index_urls = ['https://xhamster.com/channels/all/{}'.format(index_letter) for index_letter in letters]
    index_urls.extend(['https://xhamster.com/gay/channels/all/{}'.format(index_letter) for index_letter in letters])
    index_urls.extend(['https://xhamster.com/shemale/channels/all/{}'.format(index_letter) for index_letter in letters])

    channel_urls = []
    for url in index_urls:
        channel_urls.extend(get_channel_urls(url))

    with open('channel_urls', 'w') as channel_url_backup_file:
        channel_url_backup_file.write("\n".join(channel_urls))

    # with open('channel_urls') as i:  # THIS IS TO READ A PRE-DOWNLOADED URL FILE
    #     channel_urls = [url.strip() for url in i.read().split()]

    with ThreadPool(processes=10) as pool:
        pool.map(write_channel_data, channel_urls)


if __name__ == '__main__':
    main()
It does work for a while, but then I get this error. The error is obviously in the main() function, but I can't figure out how to solve it:
IndexError: list index out of range
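The [0] and [1] indexes in write_channel_data are the only bare list indexes in the code, so the likely culprit is a page (an error page, a redirect, a removed channel) whose HTML has no initials-script block. A hedged sketch of a defensive version that skips such pages instead of crashing:

def write_channel_data(url):
    # Defensive variant: check the xpath/split results before indexing into them.
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    scripts = tree.xpath('//script[@id="initials-script"]/text()')
    if not scripts:
        print("No initials script found, skipping:", url)
        return
    parts = scripts[0].strip().split("window.initials =")
    if len(parts) < 2:
        print("Unexpected script format, skipping:", url)
        return
    json_data = json.loads(parts[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)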

How to download pdf files from URLs leading to sub-URLs using Python

I am trying to download all pdf files from the links in the following URLs:
https://www.adb.org/projects/documents/country/ban/year/2020?terms=education
https://www.adb.org/projects/documents/country/ban/year/2019?terms=education
https://www.adb.org/projects/documents/country/ban/year/2018?terms=education
These URLs have lists of links which direct to sub-links containing PDF files. The lists of links in the main URLs come from the search results for a country, a year, and a term.
I have tried the following code, changing it in different ways. However, it does not seem to be working. Any help would be appreciated. Thanks.
import os
import time
from glob import glob

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = ["https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
       "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
       "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"]

folder = glob("J:/pdfs/*/")

for i, folder_location in zip(url, folder):
    time.sleep(1)
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")

    for link in soup.select("[href$='.pdf']"):
        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(i, link['href'])).content)
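The search pages themselves do not link to the PDFs directly; each result leads to a document page that in turn links to the files, so the crawl needs two levels. A hedged requests/BeautifulSoup sketch of that idea; the "a[href*='/documents/']" and "a[href$='.pdf']" selectors are assumptions about the ADB markup and may need adjusting:

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

search_urls = [
    "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education",
]

os.makedirs("pdfs", exist_ok=True)

for search_url in search_urls:
    soup = BeautifulSoup(requests.get(search_url).text, "lxml")
    # Level 1: links from the search results to the individual document pages
    doc_pages = {urljoin(search_url, a["href"])
                 for a in soup.select("a[href*='/documents/']")}
    for doc_page in doc_pages:
        doc_soup = BeautifulSoup(requests.get(doc_page).text, "lxml")
        # Level 2: direct PDF links on the document page
        for a in doc_soup.select("a[href$='.pdf']"):
            pdf_url = urljoin(doc_page, a["href"])
            filename = os.path.join("pdfs", pdf_url.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(requests.get(pdf_url).content)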
Try this. It will put the files in the PDF folder.
import os
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils


class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        if (not os.path.exists('./pdfs')):
            os.mkdir('./pdfs')

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            path = './pdfs' + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a pdf, leave it to the frame
                return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            urls.append(a)
        return {"Urls": urls}


SimplifiedMain.startThread(MySpider())  # Start download
The PDFs from each URL will be downloaded to a separate folder:
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils


class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def afterResponse(self, response, url, error=None, extra=None):
        if not extra:
            print("The version of library simplified_scrapy is too old, please update.")
            SimplifiedMain.setRunFlag(False)
            return
        try:
            path = './pdfs'
            # create folder start
            srcUrl = extra.get('srcUrl')
            if srcUrl:
                index = srcUrl.find('year/')
                year = ''
                if index > 0:
                    year = srcUrl[index + 5:]
                    index = year.find('?')
                    if index > 0:
                        path = path + year[:index]
                        utils.createDir(path)
            # create folder end

            path = path + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a pdf, leave it to the frame
                return Spider.afterResponse(self, response, url, error, extra)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            # Set root url start
            a["srcUrl"] = url.get('srcUrl')
            if not a['srcUrl']:
                a["srcUrl"] = url.url
            # Set root url end
            urls.append(a)
        return {"Urls": urls}

    # Download again by resetting the URL. Called when you want to download again.
    def resetUrl(self):
        Spider.clearUrl(self)
        Spider.resetUrlsTest(self)


SimplifiedMain.startThread(MySpider())  # Start download

How to extract and download all images from a website using beautifulSoup?

I am trying to extract and download all images from a URL.
I wrote a script:
import urllib2
import re
from os.path import basename
from urlparse import urlsplit

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()
# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)

# download all images
for imgUrl in imgUrls:
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName, 'wb')
        output.write(imgData)
        output.close()
    except:
        pass
I don't want to extract the image of this page; see this image: http://i.share.pho.to/1c9884b1_l.jpeg
I just want to get all the images without clicking on the "Next" button.
I don't understand how I can get all the pics within the "Next" class. What changes should I make to findall?
The following should extract all images from a given page and write them to the directory where the script is being run.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://pixabay.com'

response = requests.get(site)

soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://www.google.com'

response = requests.get(site)

soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
from bs4 import *
import requests
import os


def folder_create(images):
    try:
        folder_name = input("Enter Folder Name:- ")
        # folder creation
        os.mkdir(folder_name)
    except:
        print("Folder Exist with that name!")
        folder_create(images)

    download_images(images, folder_name)


def download_images(images, folder_name):
    count = 0
    print(f"Total {len(images)} Image Found!")

    if len(images) != 0:
        for i, image in enumerate(images):
            try:
                image_link = image["data-srcset"]
            except:
                try:
                    image_link = image["data-src"]
                except:
                    try:
                        image_link = image["data-fallback-src"]
                    except:
                        try:
                            image_link = image["src"]
                        except:
                            pass

            try:
                r = requests.get(image_link).content
                try:
                    # possibility of decode
                    r = str(r, 'utf-8')
                except UnicodeDecodeError:
                    with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
                        f.write(r)
                    count += 1
            except:
                pass

        if count == len(images):
            print("All Images Downloaded!")
        else:
            print(f"Total {count} Images Downloaded Out of {len(images)}")


def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    folder_create(images)


url = input("Enter URL:- ")
main(url)
If you want only the pictures, then you can just download them without even scraping the webpage. They all have the same URL pattern:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
So code as simple as this will give you all the images:
import os
import urllib
import urllib2

baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
          "cutest-pics-gallery/cute%s.jpg"

for i in range(1, 11):
    url = baseUrl % i
    urllib.urlretrieve(url, os.path.basename(url))
With BeautifulSoup you will have to click or go to the next page to scrape the images. If you want to scrape each page individually, try scraping them using their class, which is shutterset_katrina-kaifs-top-10-cutest-pics-gallery.
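If you do scrape by that class, a minimal sketch with requests and BeautifulSoup (the class name comes from the note above; whether it sits on the anchors and whether the markup is still the same are assumptions to verify):

import requests
from bs4 import BeautifulSoup

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

gallery_class = "shutterset_katrina-kaifs-top-10-cutest-pics-gallery"
image_urls = []
for a in soup.find_all("a", class_=gallery_class):
    # the anchor usually points at the full-size image; fall back to the thumbnail's src
    href = a.get("href") or (a.img.get("src") if a.img else None)
    if href:
        image_urls.append(href)

print(image_urls)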
