I set up this code to extract the links from the following website. The problem is that it breaks into register 19 and doesn't continue with the listing.
You can help me.
import urllib.request
import os
tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
data_content = response.read()
Html_file= open("html_file.html","wb")
Html_file.write(data_content)
Html_file.close()
html=codecs.open("html_file.html", 'r', 'utf-8').read()
soup = BeautifulSoup(html)
for i,img_element in enumerate(soup.findAll('img', None)):
try:
img_src = img_element['src']
print(i,img_src)
except:
pass
Related
I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this but I'm sure there's gonna be an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
links = []
images = []
link_urls = soup2.select('a')
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
print(img['src'])
return img['src']
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
# print(link)
if url_validator(link):
crawl_images(link)
I try python3 new_crawler.py imdb.com 3 which should print sources of images crawled in 3 links inside imdb.com but it's not printing anything.
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq
URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")
image_links = soup.find_all("img")
for img in image_links:
print(img['src'])
Add the website's url to the constant URL that you are trying to scrap. The page's img tags should all be saved in the image_links variable.
This is what I ended up with. It's not working how it's supposed to but the time for the task is up and I decided to share anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')
links = []
images_sources = []
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
images_sources.append(img['src'])
results = {
"imageUrl": img['src'],
"sourceUrl": link,
"depth": depth
}
json_object = json.dumps(results)
with open("results.json", "w") as f:
f.write(json_object)
f.close()
return results
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
if url_validator(link):
crawl_images(link)
My code below works but I want it to do the same exact thing but with the next page of the URL variable, this would be done by adding the number 1,2,3 depending on the page.
The code essentially scrapes a website that has the thumnails of various videos, it then returns the link to each video. I want it to do this for each page available
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
from urllib.request import Request, urlopen
URL = "domain.com/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
endof = soup.find_all('div',class_="th-image")
links = [a['href'] for a in soup.find_all('a', href=True)]
endoflinks = links[8:-8]
index = 0
for a in endoflinks:
index+=1
dwnlink = "domain.com"+ endoflinks[index]
r = requests.get(dwnlink)
f = open("output.txt", "a")
print(r.url, file=f)
f.close()
This should help you get going:
URL = "domain.com/"
for i in list(range(0,10)):
print("domain.com/"+str(i))
r = requests.get(URL+str(i))
f = open("output.txt", "a")
print(r.url, file=f)
f.close()
domain.com/0
domain.com/1
domain.com/2
domain.com/3
domain.com/4
domain.com/5
domain.com/6
domain.com/7
domain.com/8
domain.com/9
I need to download all the files from this page :
https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000&parentFilter=1433&childFilter=1433%7C1450&startMonth=1&startYear=2008&endMonth=6&endYear=2021
that have "Auction of" on their titles. This is the source for one of the files for example:
Auction of £2,500 million of 0 5/8% Treasury Gilt 2035
I am trying to adapt some code I found from another question, but the pages are coming back empty:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
def download_pgn(task):
session, url, destination_path = task
response = session.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
game_url = host + soup.find("a", text="download").get("href")
filename = re.search(r"\w+\.pgn", game_url).group()
path = os.path.join(destination_path, filename)
response = session.get(game_url, stream=True)
response.raise_for_status()
with open(path, "wb") as f:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
if __name__ == "__main__":
destination_path = "pgns"
max_workers = 8
if not os.path.exists(destination_path):
os.makedirs(destination_path)
with requests.Session() as session:
response = session.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
pages = soup.find_all("a", href=re.compile(r".*Auction of\?.*"))
tasks = [
(session, host + page.get("href"), destination_path)
for page in pages
]
with ThreadPoolExecutor(max_workers=max_workers) as pool:
pool.map(download_pgn, tasks)
Check your regular expression syntax. The regex r".*Auction of\?.*" will only match titles with an actual of? in the title.
But the href= parameter will search against the URL in the link, so that won't help you much either. This will find the links with the matching titles:
links = soup.find_all("a", string=re.compile(r"Auction of\b"))
And this will extract their URLs so you can retrieve them:
[ file["href"] for file in links ]
This is what ended up working for me:
from bs4 import BeautifulSoup
import requests
import re
links = []
url = 'https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000000&parentFilter=1433&childFilter=1433|1450&startMonth=1&startYear=2000&endMonth=6&endYear=2021'
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
for a in soup.find_all("a",{"aria-label":re.compile(r"^Auction of\b")}, href=True):
links.append(a['href'])
def download_file(url):
path = url.split('/')[-1].split('?')[0]
r = requests.get(url, stream=True)
if r.status_code == 200:
with open(path, 'wb') as f:
for chunk in r:
f.write(chunk)
host = 'https://www.dmo.gov.uk/'
for link in links:
url = host + link
download_file(url)
The find_all() method accepts a function. You can create a lambda function to filter all for a tags that contain "Auction of":
for tag in soup.find_all(lambda t: t.name == "a" and "Auction of" in t):
print(tag.text)
Or, you can use an [attribute*=value]:
# Find all `aria-label` attributes under an `a` that contain `Auction of`
for tag in soup.select("a[aria-label*='Auction of']"):
print(tag.text)
This is the code I used to take all the pics from r/pics on reddit and put it into a directory. I want to be able to take the actual files in the directory and put it into a list. Stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('direct'):
os.makedirs('direct')
os.chdir('direct')
x = 0
for image in image_tags:
try:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
f.close()
x+=1
except:
pass
Edit: Here is updated code but still dealing with problem
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('directory'):
os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []
for image in image_tags:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
mylist.append(img_path)
f.close()
x += 1
print(mylist)
create a list in the beginning of your code:
...
mylist = []
...
then after you get each image, add it to the list
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code and the image_tags is returning empty - in fact the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
Doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the reddit api so that reddit doesn't limit your requests.
I am trying to extract and download all images from a url.
I wrote a script
import urllib2
import re
from os.path import basename
from urlparse import urlsplit
url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()
# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)
# download all images
for imgUrl in imgUrls:
try:
imgData = urllib2.urlopen(imgUrl).read()
fileName = basename(urlsplit(imgUrl)[2])
output = open(fileName,'wb')
output.write(imgData)
output.close()
except:
pass
I don't want to extract image of this page see this image http://i.share.pho.to/1c9884b1_l.jpeg
I just want to get all the images without clicking on "Next" button
I am not getting how can I get the all pics within "Next" class.?What changes I should do in findall?
The following should extract all images from a given page and write it to the directory where the script is being run.
import re
import requests
from bs4 import BeautifulSoup
site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)
Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
import re
import requests
from bs4 import BeautifulSoup
site = 'http://www.google.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)
from bs4 import *
import requests
import os
def folder_create(images):
try:
folder_name = input("Enter Folder Name:- ")
# folder creation
os.mkdir(folder_name)
except:
print("Folder Exist with that name!")
folder_create()
download_images(images, folder_name)
def download_images(images, folder_name):
count = 0
print(f"Total {len(images)} Image Found!")
if len(images) != 0:
for i, image in enumerate(images):
try:
image_link = image["data-srcset"]
except:
try:
image_link = image["data-src"]
except:
try:
image_link = image["data-fallback-src"]
except:
try:
image_link = image["src"]
except:
pass
try:
r = requests.get(image_link).content
try:
# possibility of decode
r = str(r, 'utf-8')
except UnicodeDecodeError:
with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
f.write(r)
count += 1
except:
pass
if count == len(images):
print("All Images Downloaded!")
else:
print(f"Total {count} Images Downloaded Out of {len(images)}")
def main(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.findAll('img')
folder_create(images)
url = input("Enter URL:- ")
main(url)`
If you want only pictures then you can just download them without even scrapping the webpage. The all have the same URL:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
So simple code as that will give you all images:
import os
import urllib
import urllib2
baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
"cutest-pics-gallery/cute%s.jpg"
for i in range(1,11):
url = baseUrl % i
urllib.urlretrieve(url, os.path.basename(url))
With Beautifulsoup you will have to click or go to the next page to scrap the images. If you want ot scrap each page individually try to scrathem using there class which is shutterset_katrina-kaifs-top-10-cutest-pics-gallery