I am new to Python and its available libraries, and I am trying to make a script to scrape a website. I want to read all links on a parent page, and have my script parse out and read data from all children links from the parent page.
For some reason, I am getting this sequence of errors for my code:
python ./scrape.py
/
Traceback (most recent call last):
File "./scrape.py", line 27, in <module>
a = requests.get(url)
File "/Library/Python/2.7/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/Library/Python/2.7/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Python/2.7/site-packages/requests/sessions.py", line 494, in request
prep = self.prepare_request(req)
File "/Library/Python/2.7/site-packages/requests/sessions.py", line 437, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/Library/Python/2.7/site-packages/requests/models.py", line 305, in prepare
self.prepare_url(url, params)
File "/Library/Python/2.7/site-packages/requests/models.py", line 379, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/': No schema supplied. Perhaps you meant http:///?
From my Python script here:
from bs4 import BeautifulSoup
import requests
#somesite = 'https://www.somesite.com/"
# Parent page whose links are read and then requested one by one.
page = 'https://www.investopedia.com/terms/s/stop-limitorder.asp'
count = 0
#url = raw_input("Enter a website to extract the URL's from: ")
r = requests.get(page) # request the HTML document
data = r.text # response body as text
soup = BeautifulSoup(data, "html.parser") # parse the HTML with BeautifulSoup
#count = 0;
#souplist = []
#list
A = []
# Loop over all <a> tags, print each href and request it.
for link in soup.find_all('a'):
#print(link.get('href'))
url = link.get('href')
print(url)
# NOTE: the href is passed to requests unchanged; a relative href such as '/'
# has no scheme, which is what raises MissingSchema in the traceback above.
a = requests.get(url)
#a = requests.get(url)
#data1 = a.text
#souplist.insert(0, BeautifulSoup[data1])
#++count
#
#for link in soup.find_all('p'):
#print(link.getText())
Some of the links on the page you are scraping are relative URLs (relative to the website, https://www.investopedia.com). To crawl such URLs you have to turn them into absolute URLs by joining them with the site's address.
try:
    # Python 3
    from urllib.parse import urljoin, urlparse
except ImportError:
    # Python 2
    from urlparse import urljoin, urlparse

# Resolve every href against the URL of the page it was found on, so that
# root-relative ("/terms/"), path-relative ("other.asp") and scheme-relative
# ("//host/path") links all become absolute. Absolute hrefs pass through
# urljoin() unchanged.
for link in soup.find_all('a'):
    href = link.get('href')
    if not href:
        # <a> tags without an href attribute yield None; nothing to fetch.
        continue
    url = urljoin(page, href)
    if urlparse(url).scheme not in ('http', 'https'):
        # Skip mailto:, javascript:, tel: and similar links that requests
        # cannot fetch.
        continue
    print(url)
    a = requests.get(url)
Related
I am a new python user. I'm getting the following error when trying to run my code in PythonAnywhere, despite it working fine on my local PC.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zachfeatherstone/imputations.py", line 7, in <module>
html = urlopen(url).read()
File "/usr/local/lib/python3.9/urllib/request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/lib/python3.9/urllib/request.py", line 517, in open
response = self._open(req, data)
File "/usr/local/lib/python3.9/urllib/request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/local/lib/python3.9/urllib/request.py", line 494, in _call_chain
result = func(*args)
File "/usr/local/lib/python3.9/urllib/request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/local/lib/python3.9/urllib/request.py", line 1349, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error Tunnel connection failed: 403 Forbidden>
It's similar to: urllib.request.urlopen: ValueError: unknown url type.
CODE
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os.path
# Download a page, reduce it to its visible words, derive "imputations" from
# the words found, and save them to a text file.
url = input("Enter the URL you want to analyse: ")
# Close the HTTP response as soon as the body has been read.
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")

# Remove all script and style elements so only visible text remains.
for script in soup(["script", "style"]):
    script.extract()

# Get the text, strip each line, split lines into phrases and drop blanks.
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = '\n'.join(chunk for chunk in chunks if chunk)
wordList = text.split()

# imputations
imputations = []
if "unprofessional" in wordList:
    unprofessional_imputation = "That our Client is unprofessional."
    imputations.append(unprofessional_imputation)
print(imputations)
#print(wordList)
#print(text)

# saving file
# NOTE(review): this is a Windows path; on a Linux host (the traceback paths
# start with /home/...) this directory presumably does not exist -- confirm
# and adjust before deploying.
save_path = 'C:/Users/team/downloads'
name_of_file = input("What do you want to save the file as? ")
completeName = os.path.join(save_path, name_of_file+".txt")
# The context manager flushes and closes the file even if a write fails.
with open(completeName, "w") as f:
    for words in imputations:
        f.write(words)
        f.write("\n")
My apologies if this has been answered before. How do I manage to run this in PythonAnywhere so that I can deploy over the web?
It COULD be helpful to send header info along with your request. You can pass a Request object into your request like this:
# `from urllib.request import urlopen` does not bind the name `urllib`, so
# import the module itself before using `urllib.request.Request`.
import urllib.request

url = input("Enter the URL you want to analyse: ")
# Send an explicit User-Agent header instead of urllib's default one.
header = {'User-Agent': 'Gandalf'}
req = urllib.request.Request(url, None, header)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")
I'm just starting/learning to use the Google Cloud platform (functions in particular) and I wrote a simple python scraper using BeautifulSoup that is returning an error and I can't figure out why.
from bs4 import BeautifulSoup
import requests
def hello_world(request):
"""Responds to any HTTP request.
Args:
request (flask.Request): HTTP request object.
Returns:
The response text or any set of values that can be turned into a
Response object using
`make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
"""
# Fetch the page, sending a crawler-style User-Agent header.
url = 'https://example.com/'
req = requests.get(url, headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
html = req.text
soup = BeautifulSoup(html, 'html.parser')
# `soup.title` is the parsed <title> element object, not a plain string.
title = soup.title
print(title)
# NOTE(review): the traceback in the logs goes through Flask's make_response()
# -> force_type() -> run_wsgi_app(), i.e. the returned element object is being
# treated as a WSGI application. Presumably returning text instead
# (title.get_text() or str(title)) avoids the IndexError -- confirm.
return title
When I print the title of the scraped page, that shows up in the logs fine. When I return the variable though, the logs report an "IndexError: list index out of range". When I return soup.prettify() it also works fine.
This is the Traceback that I get in the GCP logs
Traceback (most recent call last): File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 2447, in wsgi_app response = self.full_dispatch_request() File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 1953, in full_dispatch_request return self.finalize_request(rv) File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 1968, in finalize_request response = self.make_response(rv) File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 2117, in make_response rv = self.response_class.force_type(rv, request.environ) File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type response = BaseResponse(*_run_wsgi_app(response, environ)) File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/wrappers/base_response.py", line 26, in _run_wsgi_app return _run_wsgi_app(*args) File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/test.py", line 1123, in run_wsgi_app return app_iter, response[0], Headers(response[1]) IndexError: list index out of range
The problem is probably caused by wrong indentation.
By the way, try this code; it may be easier to understand:
from bs4 import BeautifulSoup
import requests
url = 'https://stackoverflow.com'
def titleScaper(url, timeout=10):
    """Return the text of the <title> element of the page at *url*.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The title text, or an empty string when the page has no <title>.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    req = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"},
        # Without a timeout, requests waits indefinitely on a silent server.
        timeout=timeout,
    )
    soup = BeautifulSoup(req.content, 'html.parser')
    # soup.title is None when the document has no <title> element.
    if soup.title is None:
        return ''
    return soup.title.get_text()
title = titleScaper(url)
print(title)
I am brand new to using BeautifulSoup and I am running into an odd issue, likely user error, but I am stumped! I am using BeautifulSoup to parse through a webpage, and return the first a tag with an href attribute. When I use the Wikipedia link, it works as expected! However when I use the BestBuy link, it leads to this timeout...
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
# url = r"https://en.wikipedia.org/wiki/Eastern_Front_(World_War_II)"
# Product page whose first <a href=...> element should be printed.
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"
# The URL is opened with no custom headers and no explicit timeout argument.
html_content = urllib.request.urlopen(url)
soup = BeautifulSoup(html_content, 'html.parser')
# href=True restricts the search to <a> tags that have an href attribute.
link = soup.find('a', href=True)
print(link)
Traceback (most recent call last):
File "scrapper.py", line 8, in <module>
html_content = urllib.request.urlopen(url)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1354, in do_open
r = h.getresponse()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1347, in getresponse
response.begin()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 268, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
TimeoutError: [Errno 60] Operation timed out
Do you guys have any insight as to why this might be happening with only certain URL's? Thanks in advance!
You cannot scrape every website using BeautifulSoup; some websites have restrictions. The best practice is to always send headers:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"
# Pass the dict with the `headers` keyword: the second positional argument of
# requests.get() is `params`, which would append these values to the query
# string instead of sending them as HTTP headers.
# The timeout keeps the script from hanging forever on an unresponsive server.
req = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
Output:
<html>
<head>
<title>
Access Denied
</title>
</head>
<body>
<h1>
Access Denied
</h1>
You don't have permission to access "http://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?" on this server.
<p>
Reference #18.9f01d517.1595655333.b833c
</p>
</body>
</html>
You can achieve this task using Selenium; follow the steps below:
Step 1: Download the web driver for Chrome:
First check your Chrome version (Browser's Menu (triple vertical dots) -> Help -> About Google Chrome).
Step 2: Download the driver from here according to your Chrome browser version (mine is 81.0.4044.138).
Step 3: Once downloaded, unzip the file and place chromedriver.exe in the directory where your script is.
Step 4: pip install selenium
Now use the below code:
from selenium import webdriver
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
#your website url
site = 'https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255'
#your driver path
driver = webdriver.Chrome(executable_path = 'chromedriver.exe')
try:
    #passing website url
    driver.get(site)
    # Parse the HTML as rendered by the browser.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() ends the whole session and the chromedriver process, even when
    # loading the page fails; close() would only close the current window.
    driver.quit()
link = soup.find('a', href=True)
print(link)
Output:
<a href="https://www.bestbuy.ca/en-CA/home.aspx">
<img alt="Canada" src="https://www.bestbuy.com/~assets/bby/_intl/landing_page/images/maps/canada.svg"/>
<h4>Canada</h4>
</a>
I am new to web scraping, and I am having a few difficulties using BeautifulSoup, which seem more related to the installation than to the code itself. I have installed bs4, and want to get data from webpages. I started with a simple exercise as follows:
import requests
import urllib
from BeautifulSoup import BeautifulSoup
# Download the forecast page.
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
# NOTE(review): `BeautifulSoup` is imported here from the `BeautifulSoup`
# module (BeautifulSoup.py in the traceback), not from bs4. The traceback
# fails on `self.parseOnlyThese.text` with a 'str' object, so the
# 'html.parser' string appears to be taken as `parseOnlyThese` rather than
# as a parser name.
soup = BeautifulSoup(page.content, 'html.parser')
which gets me the following error message
Traceback (most recent call last):
File "<ipython-input-62-a9912850b0dc>", line 1, in <module>
soup = BeautifulSoup(page.content, 'html.parser')
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1522, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1147, in __init__
self._feed(isHTML=isHTML)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1189, in _feed
SGMLParser.feed(self, markup)
File "/Users/../anaconda/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/Users/../anaconda/lib/python2.7/sgmllib.py", line 174, in goahead
k = self.parse_declaration(i)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1463, in parse_declaration
j = SGMLParser.parse_declaration(self, i)
File "/Users/../anaconda/lib/python2.7/markupbase.py", line 109, in parse_declaration
self.handle_decl(data)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1448, in handle_decl
self._toStringSubclass(data, Declaration)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1381, in _toStringSubclass
self.endData(subclass)
File "/Users/../anaconda/lib/python2.7/site-packages/BeautifulSoup.py", line 1251, in endData
(not self.parseOnlyThese.text or \
AttributeError: 'str' object has no attribute 'text'
If I remove 'html.parser' and use
soup = BeautifulSoup(page.content)
the code works, but, of course, it does not give me what I need.
Any clues as to how to solve this? I am on OS X El Capitan and use Spyder as my editor. I have re-installed bs4 a few times.
Thanks
You are using an old version of BeautifulSoup. Please uninstall it, and then install BeautifulSoup4, with pip install BeautifulSoup4; and then adjust your code thus:
import requests
from bs4 import BeautifulSoup
# Download the forecast page, then parse the response body with the
# 'html.parser' parser.
forecast_url = 'http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168'
r = requests.get(forecast_url)
s = BeautifulSoup(markup=r.content, features='html.parser')
Try this snippet:
soup = BeautifulSoup(page.content, 'html.parser')
# in place of page.content use page.text
soup = BeautifulSoup(page.text, 'html.parser')
I want to create a script that returns all the URLs found in a page (a Google results page, for example), so I created this script (using BeautifulSoup):
import urllib2
from BeautifulSoup import BeautifulSoup
# Python 2 script: open the search URL with urllib2 (no custom headers).
page = urllib2.urlopen("https://www.google.dz/search?q=see")
soup = BeautifulSoup(page.read())
# Collect every <a> element and print its href attribute.
links = soup.findAll("a")
for link in links:
print link["href"]
and it returns this 403 Forbidden result:
Traceback (most recent call last):
File "C:\Python27\sql\sql.py", line 3, in <module>
page = urllib2.urlopen("https://www.google.dz/search?q=see")
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 438, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Any idea how to avoid this error, or another method to get the URLs from the search results?
No problem using requests
import requests
from BeautifulSoup import BeautifulSoup
# Same search URL as in the question, fetched with requests instead of urllib2.
page = requests.get("https://www.google.dz/search?q=see")
soup = BeautifulSoup(page.content)
# All <a> elements of the results page.
links = soup.findAll("a")
Some of the hrefs contain two URLs joined together, like search%:http://, where the end of one URL runs into the start of another, so we need to split them using re:
import requests
from bs4 import BeautifulSoup
import re

page = requests.get("https://www.google.dz/search?q=see")
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = BeautifulSoup(page.content, "html.parser")

# Raw strings keep the regex escapes intact (a plain "\?" is an invalid
# string escape). Both patterns are compiled once, outside the loop.
result_href = re.compile(r"(?<=/url\?q=)(htt.*://.*)")
joined_urls = re.compile(r":(?=http)")

# Only <a> tags whose href is a "/url?q=<target>" redirect are result links.
for link in soup.find_all("a", href=result_href):
    # Drop the redirect prefix, then split where one URL runs into the next.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(joined_urls.split(link["href"].replace("/url?q=", "")))
['https://www.see.asso.fr/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBIQFjAA&usg=AFQjCNF2_I8jB98JwR3jcKniLZekSrRO7Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:f7M8NX1XmDsJ', 'https://www.see.asso.fr/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBUQIDAA&usg=AFQjCNF8WJButjMNXQXvXBbtyXnF1SgiOg']
['https://www.see.asso.fr/3ei&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBgQ0gIoADAA&usg=AFQjCNGnPL1RiX5TekI_yMUc-w_f2oVXtw']
['https://www.see.asso.fr/node/9587&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBkQ0gIoATAA&usg=AFQjCNHX-6AzBgLQUF0s8TxFcZjIhxz_Hw']
['https://www.see.asso.fr/ree&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBoQ0gIoAjAA&usg=AFQjCNGkkd8e1JjiNrhSM4HQYE-M6g6j-w']
['https://www.see.asso.fr/node/130&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBsQ0gIoAzAA&usg=AFQjCNEkVdpcbXDz5-cV9u2NNYoV6aM8VA']
['http://www.wordreference.com/enfr/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CB0QFjAB&usg=AFQjCNHQGwcsGpro26dhxFP6q-fQvwbB0Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:ooK-I_HuCkwJ', 'http://www.wordreference.com/enfr/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCAQIDAB&usg=AFQjCNFRlV5Zv_n48Wivr4LeOkTQsA0D1Q']
['http://fr.wikipedia.org/wiki/S%25C3%25A9e&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCMQFjAC&usg=AFQjCNGmtqmcXPqYZ_nwa0RWL0uYf5PMJw']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:GjcgkyzsUigJ', 'http://fr.wikipedia.org/wiki/S%2525C3%2525A9e%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCYQIDAC&usg=AFQjCNHesOIBU3OXBspARcONbK_k_8-gnw']
['http://fr.wikipedia.org/wiki/Camille_S%25C3%25A9e&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCkQFjAD&usg=AFQjCNGO-WIDl4TrBeo88WY9QsopWmsMyQ']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:izhQjC85nOoJ', 'http://fr.wikipedia.org/wiki/Camille_S%2525C3%2525A9e%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCwQIDAD&usg=AFQjCNEfcIKsKbf026xgWT7NkrAueZvL0A']
['http://de.wikipedia.org/wiki/Zugersee&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDEQ9QEwBA&usg=AFQjCNHpfJW5-XdsgpFUSP-jEmHjXQUWHQ']
['http://commons.wikimedia.org/wiki/File:Champex_See.jpg&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDMQ9QEwBQ&usg=AFQjCNEordFWr2QIaob45WlR5Yi-ZvZSiA']
['http://www.all-free-photos.com/show/showphotop.php%3Fidtop%3D4%26lang%3Dfr&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDUQ9QEwBg&usg=AFQjCNEC24FOIE5cvF4zmEDgq5-5xubM3w']
['http://www.allbestwallpapers.com/travel-zell_am_see,_kaprun,_austria_wallpapers.html&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDcQ9QEwBw&usg=AFQjCNFkzMZDuthZHvnF-JvyksNUqjt1dQ']
['http://www.see-swe.org/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDkQFjAI&usg=AFQjCNF1zbcLfjanxgCXtHoOQXOdMgh_AQ']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:lzh6JxvKUTIJ', 'http://www.see-swe.org/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDwQIDAI&usg=AFQjCNFYN6tzzVaHsAc5aOvYNql3Zy4m3A']
['http://fr.wiktionary.org/wiki/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CD8QFjAJ&usg=AFQjCNFWYIGc1gj0prytowzqI-0LDFRvZA']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:G9v8lXWRCyQJ', 'http://fr.wiktionary.org/wiki/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEIQIDAJ&usg=AFQjCNENzi4E1n-9qHYsNahY6lQzaW5Xvg']
['http://en.wiktionary.org/wiki/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEUQFjAK&usg=AFQjCNECGZjw-rBUALO43WaTh2yB9BUhDg']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:ywc4URuPdIQJ', 'http://en.wiktionary.org/wiki/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEgQIDAK&usg=AFQjCNE0pykIqXXRl08E-uTtoj03QEpnbg']
['http://see-concept.com/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEsQFjAL&usg=AFQjCNGFWjhiH7dEBhITJt01ob_JENlz1Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:jHTkOVEoRsAJ', 'http://see-concept.com/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CE4QIDAL&usg=AFQjCNECPgxt9ZSFmZzK_ker9Hw_FoCi_A']
['http://www.theconjugator.com/la/conjugaison/du/verbe/see.html&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFEQFjAM&usg=AFQjCNETCTQ0vPDIdV_2Q57qq11dyN0d8Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:xD7_Qo7roS8J', 'http://www.theconjugator.com/la/conjugaison/du/verbe/see.html%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFQQIDAM&usg=AFQjCNF_hBCyDZncivYGnL7je5kYme9hEg']
['http://www.zellamsee-kaprun.com/fr&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFcQFjAN&usg=AFQjCNFVDeBWrZMDSjK9jKYF4AQlIXa9lA']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:BFBEUp05w7YJ', 'http://www.zellamsee-kaprun.com/fr%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFoQIDAN&usg=AFQjCNHtrOeEpYWqvT3f0M1p-gxUkYT1IA']
The best way to do this is to use the google API (pip install google)
GeeksforGeeks writes about it here
from googlesearch import search
# Term to look up.
query = "see"
# Gather every result yielded by search() into a list.
links = []
links.extend(search(query, tld="co.in", num=10, stop=10, pause=2))
import urllib.request

from bs4 import BeautifulSoup

# urllib.request is Python 3 only, so it is paired here with bs4 and print()
# rather than the legacy `BeautifulSoup` (version 3) module and the Python 2
# print statement.
# An explicit User-Agent replaces urllib's default one, which is what the
# server answered with "403 Forbidden" in the question.
request = urllib.request.Request(
    "https://www.google.dz/search?q=see",
    headers={"User-Agent": "Mozilla/5.0"},
)
# Close the HTTP response as soon as the body has been parsed.
with urllib.request.urlopen(request) as page:
    soup = BeautifulSoup(page.read(), "html.parser")

# href=True skips <a> tags without an href attribute, which would otherwise
# raise KeyError on link["href"].
links = soup.find_all("a", href=True)
for link in links:
    print(link["href"])