This Python code downloads an image from a Google search result. It works fine, but all of the images are low quality.
How can I download them in higher quality?
Also, sorry for my badly named variables.
import requests
import bs4
ss = "hello world"
sss = requests.utils.quote(ss)
print(sss)
jomle = str(sss)
utl = r"https://www.google.com/search?q="+jomle
req = requests.get(utl)
print("koobs")
soup = bs4.BeautifulSoup(req.text, "html.parser")
els = soup.find_all("img")
somethin = els[2]["src"]
print(somethin)
I am writing this code to get information about top movies and also download the image belonging to each movie. Some of the images get downloaded but their size is 0 (even though they show a size on disk). When I click the link of an image that I can't download properly, it opens fine, so there is no problem with the link.
For example, this is one of the image links:
https://static.stacker.com/s3fs-public/styles/slide_desktop/s3/00000116_4_0.png
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = "https://stacker.com/stories/1587/100-best-movies-all-time"
count = 0
local_description = ""
movie_data = []
data = requests.get(URL).text
soap = BeautifulSoup(data, "html.parser")
titles = soap.find_all(name="h2", class_="ct-slideshow__slide__text-container__caption")[1:]
description = soap.find_all(name="div", class_="ct-slideshow__slide__text-container__description")[1:]
images = soap.find_all(name="img", typeof="foaf:Image")[6:106]
for num in range(100):
    movie_name = titles[num].getText().replace("\n", "")
    local_des = description[num].find_all(name="p")[1:]
    for s in local_des:
        local_description = s.getText().replace(" ", "")
    local_data = {"title": movie_name, "description": local_description}
    movie_data.append(local_data)
    movie_image_link = images[num].get("src")
    response = requests.get(movie_image_link)
    with open(f"images/{movie_name}.png", 'wb') as f:
        f.write(response.content)
    count += 1
    print(count)
data_collected = pd.DataFrame(movie_data)
data_collected.to_csv("Data/100_movie.csv", index=False)
I found my problem: some movie names contained a ":", and as you know you can't use ":"
in file names. I fixed it with .replace(); note that str.replace() returns a new string, so it has to be assigned back:
movie_name = movie_name.replace(":", "")
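A slightly more general sanitizer, in case other reserved characters turn up (just a sketch; the character set below is the usual set that Windows rejects in file names):

import re

def safe_filename(name):
    # drop the characters that Windows does not allow in file names
    return re.sub(r'[<>:"/\\|?*]', "", name).strip()

movie_name = safe_filename(movie_name)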
Once you get a response, check if it's empty before writing to disk. Might need to retry or the link may be bad.
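For example, something along these lines (a sketch reusing the names from the question's loop; the retry count and User-Agent header are arbitrary choices):

import time
import requests

def fetch_image(url, retries=3):
    # hypothetical helper: retry a few times and only return non-empty content
    for attempt in range(retries):
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.ok and response.content:
            return response.content
        time.sleep(1)  # brief pause before trying again
    return None

content = fetch_image(movie_image_link)
if content:
    with open(f"images/{movie_name}.png", "wb") as f:
        f.write(content)
else:
    print(f"skipping {movie_name}: empty or failed download")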
I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this but I'm sure there's gonna be an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
links = []
images = []
link_urls = soup2.select('a')
def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False
def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        print(img['src'])
        return img['src']  # note: returning here exits after the first image
for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    # print(link)
    if url_validator(link):
        crawl_images(link)
I run python3 new_crawler.py imdb.com 3, which should print the sources of images crawled from 3 links inside imdb.com, but it's not printing anything.
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq
URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")
image_links = soup.find_all("img")
for img in image_links:
    print(img['src'])
Add the URL of the site you are trying to scrape to the URL constant. The page's img tags should all end up in the image_links variable.
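If you still need to follow links first and then scan the images on each linked page, one thing worth checking is that many of the hrefs on a site like imdb.com are relative (e.g. /chart/top), so they fail url_validator and the final loop silently skips everything. Here is a sketch of that loop that resolves them against the start page first (reusing the names from the question's code):

from urllib.parse import urljoin

base = 'https://www.' + crawl + '/'
for link_url in link_urls[:depth]:
    href = link_url.get('href')     # some <a> tags have no href at all
    if not href:
        continue
    absolute = urljoin(base, href)  # turns "/chart/top" into a full URL
    if url_validator(absolute):
        crawl_images(absolute)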
This is what I ended up with. It's not working how it's supposed to but the time for the task is up and I decided to share anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')
links = []
images_sources = []
def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False
def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        images_sources.append(img['src'])
        results = {
            "imageUrl": img['src'],
            "sourceUrl": link,
            "depth": depth
        }
        json_object = json.dumps(results)
        # note: opening with "w" here rewrites results.json on every image,
        # so only the last result ends up in the file
        with open("results.json", "w") as f:
            f.write(json_object)
    return results
for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    if url_validator(link):
        crawl_images(link)
This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list, and I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('direct'):
os.makedirs('direct')
os.chdir('direct')
x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('directory'):
os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
        x += 1

print(mylist)
Create a list at the beginning of your code:
...
mylist = []
...
Then, after you save each image, add its path to the list:
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
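If the goal is to list the files that are actually sitting in the directory, rather than tracking the paths as you save them, os.listdir works too (a small sketch; it assumes it runs after the os.chdir('directory') call, so the images are in the current working directory):

import os

files_on_disk = [name for name in os.listdir('.') if name.endswith('.jpg')]
print(files_on_disk)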
EDIT:
I executed your updated code and image_tags comes back empty. In fact, the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the reddit api so that reddit doesn't limit your requests.
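For example, reddit also exposes the same listing as JSON, which is much easier to work with than the rendered HTML (a sketch; the User-Agent string is arbitrary, and only post URLs ending in common image extensions are kept):

import requests

headers = {"User-Agent": "simple image scraper 0.1"}  # reddit tends to block the default requests User-Agent
resp = requests.get("https://www.reddit.com/r/drawing.json", headers=headers)
resp.raise_for_status()

image_urls = []
for child in resp.json()["data"]["children"]:
    url = child["data"].get("url", "")
    if url.endswith((".jpg", ".png", ".gif")):
        image_urls.append(url)

print(image_urls)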
So I have a function that is called when I click a button; it goes as below:
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";
function loadMoreNews(){
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    $.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
        data = JSON.parse(data);
        min_news_id = data.min_news_id||min_news_id;
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now, I don't have much experience with JavaScript, but I assume it's returning some JSON data from some sort of API at "/en/ajax/more_news".
Is there a way I could directly call this API and get the JSON data from my Python script? If yes, how?
If not, how do I scrape the content that is being generated?
You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news. This is an example using requests:
from bs4 import BeautifulSoup
import requests
import re
# pattern to extract min_news_id
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

with requests.Session() as s:
    soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "html.parser")
    new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
    print(new_id_scr.text)
    news_id = patt.search(new_id_scr.text).group(1)  # group(1) is just the id itself, without the surrounding JS
    js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
    print(js.json())
The response is JSON that contains all the HTML; you just have to access js.json()["html"].
Here is a script that will automatically loop through all the pages on inshorts.com:
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
i = 0
while(1):
    with requests.Session() as s:
        if(i==0):soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content,"lxml")
        new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
        news_id = patt.search(new_id_scr.text).group(1)
        js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
        jsn = json.dumps(js.json())
        jsonToPython = json.loads(jsn)
        news_id = jsonToPython["min_news_id"]
        data = jsonToPython["html"]
        i += 1
    soup = BeautifulSoup(data, "lxml")
    for tag in soup.find_all("div", {"class":"news-card"}):
        main_text = tag.find("div", {"itemprop":"articleBody"})
        summ_text = main_text.text
        summ_text = summ_text.replace("\n", " ")
        result = tag.find("a", {"class":"source"})
        art_url = result.get('href')
        if 'www.youtube.com' in art_url:
            print("Nothing")
        else:
            art_url = art_url[:-1]
            #print("Hello", art_url)
            article = Article(art_url)
            article.download()
            if article.is_downloaded:
                article.parse()
                article_text = article.text
                article_text = article_text.replace("\n", " ")
                print(article_text+"\n")
                print(summ_text+"\n")
It gives both the summary from inshorts.com and the complete news story from the respective news channel.
So I wanted to get all of the pictures on this page (of the NBA teams):
http://www.cbssports.com/nba/draft/mock-draft
However, my code gives a bit more than that. It gives me,
<img src="http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png" alt="Orlando Magic" width="30" height="30" border="0" />
How can I shorten it so it only gives me http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png?
My code:
import urllib2
from BeautifulSoup import BeautifulSoup
# or if your're using BeautifulSoup4:
# from bs4 import BeautifulSoup
soup = BeautifulSoup(urllib2.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read())
rows = soup.findAll("table", attrs = {'class': 'data borderTop'})[0].tbody.findAll("tr")[2:]
for row in rows:
    fields = row.findAll("td")
    if len(fields) >= 3:
        anchor = row.findAll("td")[1].find("a")
        if anchor:
            print anchor
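For reference, the URL itself can be read straight off the tag's src attribute; here is a minimal tweak to the last print above (a sketch that assumes anchor either is the <img> tag or wraps one, using the BeautifulSoup API already imported):

if anchor:
    img = anchor if anchor.name == "img" else anchor.find("img")
    if img and img.get("src"):
        print(img["src"])  # e.g. http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png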
I know this can be "traumatic", but for those automatically generated pages, where you just want to grab the damn images away and never come back, a quick-n-dirty regular expression that takes the desired pattern tends to be my choice (no Beautiful Soup dependency is a great advantage):
import urllib, re

source = urllib.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read()

## every image name is an abbreviation composed by capital letters, so...
for link in re.findall('http://sports.cbsimg.net/images/nba/logos/30x30/[A-Z]*.png', source):
    print link
    ## the code above just prints the link;
    ## if you want to actually download, set the flag below to True
    actually_download = False
    if actually_download:
        filename = link.split('/')[-1]
        urllib.urlretrieve(link, filename)
Hope this helps!
To save all the images on http://www.cbssports.com/nba/draft/mock-draft,
import urllib2
import os
from BeautifulSoup import BeautifulSoup
URL = "http://www.cbssports.com/nba/draft/mock-draft"
default_dir = os.path.join(os.path.expanduser("~"),"Pictures")
opener = urllib2.build_opener()
urllib2.install_opener(opener)
soup = BeautifulSoup(urllib2.urlopen(URL).read())
imgs = soup.findAll("img",{"alt":True, "src":True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename,"wb")
    f.write(img_data.read())
    f.close()
To save any particular image on http://www.cbssports.com/nba/draft/mock-draft,
use
soup.find("img",{"src":"image_name_from_source"})
You can use these functions to get the list of all image URLs from a URL.
import re
import requests

#
#
# get_url_images_in_text()
#
# @param html - the html to extract image urls from.
# @param protocol - the protocol of the website, to prepend to urls that do not start with a protocol.
#
# @return list of image urls.
#
#
def get_url_images_in_text(html, protocol):
    urls = []
    all_urls = re.findall(r'((http\:|https\:)?\/\/[^"\' ]*?\.(png|jpg))', html, flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
    for url in all_urls:
        if not url[0].startswith("http"):
            urls.append(protocol + url[0])
        else:
            urls.append(url[0])
    return urls

#
#
# get_images_from_url()
#
# @param url - the url to extract image urls from.
#
# @return list of image urls.
#
#
def get_images_from_url(url):
    protocol = url.split('/')[0]
    resp = requests.get(url)
    return get_url_images_in_text(resp.text, protocol)
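Usage might look like this (the URL is just an example):

image_urls = get_images_from_url("https://www.cbssports.com/nba/draft/mock-draft")
for image_url in image_urls:
    print(image_url)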