#!/usr/bin/env python
import urllib
import mechanize
from bs4 import BeautifulSoup
from urlparse import urlparse
def getPic(search):
    search = search.replace(" ", "%20")
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-Agent', 'Mozilla')]
        htmltext = browser.open("https://www.google.com/search?site=&tbm=isch&source=hp&biw=1855&bih=990&q=" + search + "&oq=" + search)
        img_url = []
        formatted_images = []
        soup = BeautifulSoup(htmltext)
        results = soup.findAll("a")
        for r in results:
            try:
                if "imgres?imgurl" in r['href']:
                    img_url.append(r['href'])
            except:
                a = 0
        for im in img_url:
            refer_url = urlparse(str(img_url[0]))
            return refer_url.query.split("&")[0].replace("imgurl=", "")
        return formatted_images
    except:
        print "error"

print getPic("occupy wall street")
Instead of getting the link of an image as output, I'm getting "[]". Can someone figure out what's wrong with my code?
Google sends the "imgres?imgurl" links only to browsers with JavaScript enabled, but mechanize.Browser() behaves like a browser without JavaScript.
Turn off JavaScript in your own browser and look at the HTML Google sends you: those links are not there, so the loop never appends anything and the function falls through to returning the empty formatted_images list.
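A quick way to confirm this is to fetch the same search URL with a plain HTTP client and check whether any "imgres?imgurl" anchors appear in the markup at all. A minimal sketch (the query and header values are just illustrative):

import requests

# Any image search term works for this check.
query = "occupy wall street".replace(" ", "%20")
url = "https://www.google.com/search?tbm=isch&q=" + query

# A plain HTTP client executes no JavaScript, just like mechanize.
html = requests.get(url, headers={"User-Agent": "Mozilla"}).text

# If this prints False, the non-JS markup simply has no imgres links to collect.
print("imgres?imgurl" in html)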
Hi guys, I am trying to get this code to download images from Google. I am stuck at this point because I have tried everything I can think of and I still don't know what is going on. Please have a look at the code below and the error message I am getting.
The code mostly runs: it opens the browser and scrolls through the page, but the images are not downloaded and I get an error message.
import requests
import time
import urllib
import argparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from fake_useragent import UserAgent
from multiprocessing import Pool
from lxml.html import fromstring
import os, sys
import wget
no=1
def search(url):
    # Create a browser
    browser = webdriver.Chrome('chromedriver')
    browser.implicitly_wait(30)

    # Open the link
    browser.get(url)
    time.sleep(0.5)
    element = browser.find_element_by_tag_name("body")

    # Scroll down
    for i in range(40):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.1)

    browser.find_element_by_id("smb").click()

    for i in range(10):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
    time.sleep(1)

    # Get page source and close the browser
    source = browser.page_source
    browser.close()
    return source
def download_image(link):
    global no
    # print link
    # Use a random user agent header
    headers = {"User-Agent": ua.random}
    # Get the image link
    try:
        r = requests.get("https://www.google.com" + link.get("href"), headers=headers)
    except:
        print("Cannot get link.")
    title = fromstring(r.content).findtext(".//title")
    link_url = title.split(" ")[-1]
    print(link_url)
    if link_url.find(".jpg") == len(link_url) - 4:
        # Download the image
        wget.download(link_url, str(os.getcwd()) + "/" + query + "/" + str(no) + ".jpg")
        no = no + 1
# set stack limit
sys.setrecursionlimit(1000)

# get user input and search on google
query = input("Enter the name you want to search")
url = "https://www.google.com/search?as_st=y&tbs=isz%3Alt%2Cislt%3Asvga%2Citp%3Aphoto%2Cift%3Ajpg&tbm=isch&sa=1&ei=H_-KW6GSHImGoAS3z4DYCA&q=" + query + "&oq=" + query + "&gs_l=img.3..0l10.19389.19389.0.21095.1.1.0.0.0.0.113.113.0j1.1.0....0...1c.1.64.img..0.1.111....0.QpKT5Qs8Kdo"
print(url)
source = search(url)
count = 1

# Parse the page source and download pics
page_text = source.encode('utf-8').decode('ascii', 'ignore')
soup = BeautifulSoup(page_text, "html.parser")
ua = UserAgent()

# check directory and create if necessary
if not os.path.isdir(query):
    os.makedirs(query)
os.chdir(str(os.getcwd()) + "/" + query)

# get the links
links = soup.find_all("a", class_="rg_l")

for a in links[0:count]:
    try:
        download_image(a)
    except:
        pass
and I get this error. I have tried adding browser.implicitly_wait(30) to the code, but that does not help either:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="smb"]"}
(Session info: chrome=83.0.4103.116)
Could you please tell me how to resolve this? Thank you in advance!
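The traceback points at the browser.find_element_by_id("smb") line: no element with id "smb" (the old "Show more results" button) exists on the page that loads. One hedged workaround, assuming that button is simply optional on the current layout, is to guard the click so its absence does not abort the scrape. This is only a sketch of the defensive pattern, not a fix for the rest of the scraper:

from selenium.common.exceptions import NoSuchElementException

def click_if_present(browser, element_id):
    # Click the element if it exists; silently skip it otherwise.
    try:
        browser.find_element_by_id(element_id).click()
        return True
    except NoSuchElementException:
        return False

# Inside search(), the unguarded click would become:
# click_if_present(browser, "smb")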
This is the code I wrote in Python for opening a URL.
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import time
import requests
from random import randint
import urllib.parse
class AmazonReviews():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def open_url(self, url):
        values = {}
        data = urllib.parse.urlencode(values).encode('utf-8')
        req = urllib.request.Request(url, data, self.headers)
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def fetch_reviews(self, all_reviews_link):
        try:
            url = "https://www.amazon.in" + all_reviews_link
            print(url)
            html = self.open_url(url)
        except HTTPError as e:
            print(e)

review = AmazonReviews()
review.fetch_reviews('/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8')
I am passing the URL like this because, in the main project, it is scraped from an href attribute that gives only the relative path.
If there is any method to get the absolute URL, please suggest it.
Output -
https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8
HTTP Error 404: NotFound
Link to the code:
https://onlinegdb.com/SyFPXzWVI
Use Selenium instead:
from selenium import webdriver
import os
browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver")
link = "https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"
browser.get(link)
name = browser.find_element_by_xpath('//*[@id="customer-profile-name-header"]/div[2]/span').text
Output:
Dheeraj Malhotra
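On the side question about turning a scraped relative href into an absolute URL, the standard-library urllib.parse.urljoin does this without any scraping framework. A minimal sketch using the profile path from the question:

from urllib.parse import urljoin

base = "https://www.amazon.in"
relative_href = "/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"

# urljoin resolves the relative path against the base URL.
absolute_url = urljoin(base, relative_href)
print(absolute_url)
# https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8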
I want to scrape a website and its sub-pages, but it is taking too long. How can I optimize the request or use an alternative solution?
Below is the code I am using. It takes 10 s just to load the Google home page, so it is clearly not scalable if I were to give it 280 links.
from selenium import webdriver
import time
# prepare the option for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')
# start chrome browser
browser = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver" ,chrome_options=options)
start=time.time()
browser.get('http://www.google.com/xhtml')
print(time.time()-start)
browser.quit()
Use the Python requests and BeautifulSoup modules.
import requests
from bs4 import BeautifulSoup
url="https://tajinequiparle.com/dictionnaire-francais-arabe-marocain/"
url1="https://tajinequiparle.com/dictionnaire-francais-arabe-marocain/{}/"
req = requests.get(url,verify=False)
soup = BeautifulSoup(req.text, 'html.parser')
print("Letters : A")
print([item['href'] for item in soup.select('.columns-list a[href]')])
letters = ['B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
for letter in letters:
    req = requests.get(url1.format(letter), verify=False)
    soup = BeautifulSoup(req.text, 'html.parser')
    print('Letters : ' + letter)
    print([item['href'] for item in soup.select('.columns-list a[href]')])
You can use this script for speed; a multi-threaded crawler beats the single-threaded alternatives:
https://edmundmartin.com/multi-threaded-crawler-in-python/
After that, change the run_scraper code like this:
def run_scraper(self):
    with open("francais-arabe-marocain.csv", 'a') as file:
        file.write("url")
        file.writelines("\n")
        for i in range(50000):
            try:
                target_url = self.to_crawl.get(timeout=600)
                if target_url not in self.scraped_pages and "francais-arabe-marocain" in target_url:
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
                    df = pd.DataFrame([{'url': target_url}])
                    df.to_csv(file, index=False, header=False)
                    print(target_url)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue
If a URL includes "francais-arabe-marocain", it is saved to the CSV file.
After that you can scrape those URLs in a single for loop, reading the CSV line by line in the same way, as sketched below.
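A second pass over that CSV might look like the following; the column layout matches the crawler above, but the extraction step is only a placeholder:

import csv

import requests
from bs4 import BeautifulSoup

# Read the URLs collected by the crawler and scrape them one by one.
with open("francais-arabe-marocain.csv", newline="") as f:
    for row in csv.reader(f):
        if not row or row[0] == "url":
            continue  # skip blank lines and the header line
        url = row[0]
        page = requests.get(url, verify=False)
        soup = BeautifulSoup(page.text, "html.parser")
        # Placeholder extraction step: pull out whatever entries you need
        # from each dictionary page with your own selectors.
        print(url, soup.title.string if soup.title else "")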
Try urllib instead, like this:
import time
import urllib.request

start = time.time()
page = urllib.request.urlopen("https://google.com/xhtml")
print(time.time() - start)
It took only 2 s. However, it also depends on the quality of the connection you have.
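Since the original concern was scaling to roughly 280 links, plain HTTP fetching also parallelises well. A hedged sketch using requests with a thread pool (the URL list is only a stand-in):

import time
from concurrent.futures import ThreadPoolExecutor

import requests

urls = ["https://www.google.com/xhtml"] * 10  # stand-in for the ~280 real links

def fetch(url):
    # One request per URL; a shared Session would shave off a bit more time.
    return requests.get(url, timeout=10).text

start = time.time()
with ThreadPoolExecutor(max_workers=8) as pool:
    pages = list(pool.map(fetch, urls))
print(len(pages), "pages in", time.time() - start, "seconds")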
I want to submit a multipart/form-data form that sets the input for a simulation on TRILEGAL, and then download the file available from the redirected page.
I studied the documentation of requests, urllib, Grab, mechanize, etc., and it seems that in mechanize my code would be:
from mechanize import Browser
browser = Browser()
browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal")
browser.select_form(nr=0)
browser['gal_coord'] = ["2"]
browser['eq_alpha'] = ["277.981111"]
browser['eq_delta'] = ["-19.0833"]
response = browser.submit()
content = response.read()
However, I could not test it because mechanize is not available for Python 3.
So I tried requests:
import requests
url = 'http://stev.oapd.inaf.it/cgi-bin/trilegal'
values = {'gal_coord':"2",
'eq_alpha':"277.981111",
'eq_delta':"-19.0833",
'field':" 0.047117",
}
r = requests.post(url, files = values)
but I can't figure out how to get to the results page. If I look at
r.content
it shows the content of the form I had just submitted, whereas if you open the actual website and click 'submit', you are taken to a new page (following the method="post" action="./trilegal_1.6").
How can I get to that new page with requests (i.e. follow to the page that opens up when I click the submit button), and then click the link on the results page to retrieve the results file ("The results will be available after about 2 minutes at THIS LINK.")?
If you can point me to any other tool that could do the job, I would be really grateful; I have spent hours looking through SO for something that could help solve this problem.
Thank you!
Chris
Here is a working solution for Python 2.7:
from mechanize import Browser
from urllib import urlretrieve # for download purpose
from bs4 import BeautifulSoup
browser = Browser()
browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal")
browser.select_form(nr=0)
browser['gal_coord'] = ["2"]
browser['eq_alpha'] = ["277.981111"]
browser['eq_delta'] = ["-19.0833"]
response = browser.submit()
content = response.read()
soup = BeautifulSoup(content, 'html.parser')
base_url = 'http://stev.oapd.inaf.it'
# fetch the url from the page source and append it to the base url
link = soup.findAll('a')[0]['href'].split('..')[1]
url = base_url + str(link)
filename = 'test.dat'
# now download the file
urlretrieve(url, filename)
Your file will be downloaded as test.dat. You can open it with the appropriate program.
I am posting a separate answer because a comment would be too cluttered. Thanks to @ksai, this works in Python 2.7:
import re
import time
from mechanize import Browser
browser = Browser()
browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal")
browser.select_form(nr=0)
#set appropriate form contents
browser['gal_coord'] = ["2"]
browser['eq_alpha'] = "277.981111"
browser['eq_delta'] = "-19.0833"
browser['field'] = " 0.047117"
browser['photsys_file'] = ["tab_mag_odfnew/tab_mag_lsst.dat"]
browser["icm_lim"] = "3"
browser["mag_lim"] = "24.5"
response = browser.submit()
# wait 1 min while results are prepared
time.sleep(60)
# select the appropriate url
url = 'http://stev.oapd.inaf.it/' + str(browser.links()[0].url[3:])
# download the results file
browser.retrieve(url, 'test1.dat')
Thank you very much!
Chris
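For what it's worth, the same flow can be sketched in Python 3 with requests. This is untested against TRILEGAL and assumes the CGI accepts an ordinary urlencoded POST with the field names used in the mechanize version above, so treat it only as a starting point:

import time

import requests
from bs4 import BeautifulSoup

base = "http://stev.oapd.inaf.it"
form_data = {
    "gal_coord": "2",
    "eq_alpha": "277.981111",
    "eq_delta": "-19.0833",
    "field": " 0.047117",
    "photsys_file": "tab_mag_odfnew/tab_mag_lsst.dat",
    "icm_lim": "3",
    "mag_lim": "24.5",
}

# Submit the form; the action seen in the page source is ./trilegal_1.6.
resp = requests.post(base + "/cgi-bin/trilegal_1.6", data=form_data)

# Wait while the results are prepared, then take the first link off the reply page.
time.sleep(60)
soup = BeautifulSoup(resp.text, "html.parser")
link = soup.find("a")["href"].lstrip(".")  # e.g. ../tmp/output.dat -> /tmp/output.dat (illustrative name)

with open("test1.dat", "wb") as f:
    f.write(requests.get(base + link).content)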
So far my code successfully lifts the HTML from the 5 results it picks up when given the name of a subreddit. Now I want to search for the imgur links, whether it is an album (containing /a/) or a single image. I then want to lift that link and send it to another class (imgurdl).
What is the best way, given my current code?
from bs4 import BeautifulSoup
import praw
from urllib2 import urlopen
import urllib2
import sys
from urlparse import urljoin
import config
import imgurdl
import requests
cache = []
soup = BeautifulSoup
def reddit_login():
    r = praw.Reddit(username = USER,
                    password = config.password,
                    client_id = config.client_id,
                    client_secret = config.client_secret,
                    user_agent = " v0.3"
                    )
    print("***********logged in successfully***********")
    return r

def get_category_links(subredditName, r):
    print("Grabbing subreddit...")
    submissions = r.subreddit(subredditName).hot(limit=5)
    print("Grabbing comments...")
    #comments = subred.comments(limit = 200)
    for submission in submissions:
        htmlSource = requests.get(submission.url).text
        print(htmlSource)

r = reddit_login()
get_category_links(sys.argv[1], r)
You can get the URL from PRAW and then, within the loop itself, check whether it is from imgur and send it to the appropriate function. This way there is no need to go through the HTML source at all.
for submission in submissions:
    link = submission.url
    if "imgur.com/a/" in link:
        pass  # Send to imgur album downloader
    elif link.endswith(".jpg") or link.endswith(".png"):
        pass  # Send to image downloader
    elif "imgur.com/" in link:
        pass  # Send to single image imgur downloader