BeautifulSoup timing out with certain URLs? - python

I am brand new to using BeautifulSoup and I am running into an odd issue, likely user error, but I am stumped! I am using BeautifulSoup to parse through a webpage, and return the first a tag with an href attribute. When I use the Wikipedia link, it works as expected! However when I use the BestBuy link, it leads to this timeout...
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
# url = r"https://en.wikipedia.org/wiki/Eastern_Front_(World_War_II)"
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"
html_content = urllib.request.urlopen(url)
soup = BeautifulSoup(html_content, 'html.parser')
link = soup.find('a', href=True)
print(link)
Traceback (most recent call last):
File "scrapper.py", line 8, in <module>
html_content = urllib.request.urlopen(url)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1354, in do_open
r = h.getresponse()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1347, in getresponse
response.begin()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 268, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
TimeoutError: [Errno 60] Operation timed out
Do you have any insight as to why this might be happening with only certain URLs? Thanks in advance!

You cannot scrape every website with BeautifulSoup alone; some websites restrict automated access. It is good practice to always send request headers (at least a browser-like User-Agent):
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent instead of the default python-requests one.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"

# The headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so requests.get(url, headers) would append the
# dict to the query string and never send the User-Agent header.
# The timeout makes a stalled connection raise instead of blocking forever.
req = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
Output:
<html>
<head>
<title>
Access Denied
</title>
</head>
<body>
<h1>
Access Denied
</h1>
You don't have permission to access "http://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?" on this server.
<p>
Reference #18.9f01d517.1595655333.b833c
</p>
</body>
</html>
You can achieve this task using Selenium; follow the steps below:
Step 1: Download the web driver for Chrome:
First check your Chrome version (browser menu (three vertical dots) -> Help -> About Google Chrome).
Step 2: Download the ChromeDriver release that matches your Chrome version from the ChromeDriver downloads page (mine is 81.0.4044.138).
Step 3: Once downloaded, unzip the file and place chromedriver.exe in the directory where your script is.
Step 4: pip install selenium
Now use the code below:
from selenium import webdriver
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request

# your website url
site = 'https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255'
# your driver path
driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # passing website url
    driver.get(site)
    # Grab the rendered HTML while the browser session is still alive.
    page_source = driver.page_source
finally:
    # quit() ends the whole browser session and runs even when get() raises;
    # close() would only close the current window.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
# First <a> element that carries an href attribute (None if there is none).
link = soup.find('a', href=True)
print(link)
Output:
<a href="https://www.bestbuy.ca/en-CA/home.aspx">
<img alt="Canada" src="https://www.bestbuy.com/~assets/bby/_intl/landing_page/images/maps/canada.svg"/>
<h4>Canada</h4>
</a>

Related

How can I download music files from websites using #Python

How can I download music files from websites using #Python
this code
from bs4 import BeautifulSoup
from requests import *
import urllib
link = input("https://www.chosic.com/free-music/all/")
url = urllib.request.urlopen(link)
content = url.read()
soup = BeautifulSoup(content,'html.parser')
for audio in soup.find_all('audio'):
print(len(audio))
Traceback (most recent call last):
File "C:\Users\pc\Desktop\Downloads files from url using python .py", line 8, in <module>
url = urllib.request.urlopen(link)
File "C:\Program Files\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python39\lib\urllib\request.py", line 501, in open
req = Request(fullurl, data)
File "C:\Program Files\Python39\lib\urllib\request.py", line 320, in __init__
self.full_url = url
File "C:\Program Files\Python39\lib\urllib\request.py", line 346, in full_url
self._parse()
File "C:\Program Files\Python39\lib\urllib\request.py", line 375, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
I want to extract the mp3 and wav links from that website page.
Could someone please help me?
You can use the following example to download all the mp3 files from that page:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}
url = "https://www.chosic.com/free-music/all/"

# Send the same browser-like headers for the listing page as for the
# individual downloads.
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

# Every element with a data-url attribute is treated as one audio file.
for tag in soup.select("[data-url]"):
    u = tag["data-url"]
    print("Downloading {}".format(u))
    response = requests.get(u, headers=headers)
    if not response.ok:
        # Do not save an HTTP error page under an .mp3 file name.
        print("Skipping {} (HTTP {})".format(u, response.status_code))
        continue
    # The file is named after the last path segment of the URL.
    with open(u.split("/")[-1], "wb") as f_out:
        f_out.write(response.content)
Prints:
Downloading https://www.chosic.com/wp-content/uploads/2020/06/John_Bartmann_-_09_-_Happy_Clappy-1.mp3
Downloading https://www.chosic.com/wp-content/uploads/2020/11/batchbug-sweet-dreams.mp3
Downloading https://www.chosic.com/wp-content/uploads/2021/01/fm-freemusic-inspiring-optimistic-upbeat-energetic-guitar-rhythm.mp3
Downloading https://www.chosic.com/wp-content/uploads/2021/02/keys-of-moon-white-petals.mp3
...and so on.
and saves the *mp3 files.

How to fix ' raise ValueError("unknown url type: %r" % self.full_url) ValueError: unknown url type: '''

So I have been following along some videos to learn python, but can't get rid of this error. I have experience in other languages so I'm usually fine to fix errors, but no matter what I do, I either get the same error or something different.
I've tried switching the argument from 'xml' to 'lxml', but this only changes the error that I get
from bs4 import BeautifulSoup
import urllib.request
req = urllib.request.urlopen('http://pythonprogramming.net/')
xml = BeautifulSoup(req, 'xml')
for item in xml.findAll('link'):
url = item.text
news = urllib.request.urlopen(url).read()
print(news)
Ideally, this would print out some of the text within link tags, but instead, I get the following errors -
Error while using xml -
File "/Users/rodrigo/Desktop/ALL/Programming/Python/Python Web Programming/Working with HTML/scrapingParagraphData.py", line 13, in <module>
news = urllib.request.urlopen(url).read()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 548, in _open
'unknown_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: #media (min-width>
Error while using lxml -
File "/Users/rodrigo/Desktop/ALL/Programming/Python/Python Web Programming/Working with HTML/scrapingParagraphData.py", line 13, in <module>
news = urllib.request.urlopen(url).read()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 510, in open
req = Request(fullurl, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 328, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 354, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Your current code is targeting link elements and is extracting text rather than href so there is no known protocol to work with.
Even if you extracted the href they are relative so you would still have a problem with unknown protocol.
item['href'] would give:
/static/favicon.ico
/static/css/materialize.min.css
https://fonts.googleapis.com/icon?family=Material+Icons
/static/css/bootstrap.css
I don't think you are after these type of links. If you were after the tutorial links then you need something that targets those elements e.g.
tutorial_links = ['https://pythonprogramming.net' + i['href'] for i in xml.select('.waves-light.btn')]
I would also rename the variable that holds the BeautifulSoup(req, 'lxml') result to soup:
from bs4 import BeautifulSoup
import urllib.request

req = urllib.request.urlopen('http://pythonprogramming.net/')
soup = BeautifulSoup(req, 'lxml')
# The matched hrefs are site-relative, so the site origin is prepended.
# The selection must run on `soup` (the name assigned above); `xml` is not
# defined in this snippet.
tutorial_links = ['https://pythonprogramming.net' + i['href'] for i in soup.select('.waves-light.btn')]

Requests Library

I am new to Python and its available libraries, and I am trying to make a script to scrape a website. I want to read all links on a parent page, and have my script parse out and read data from all children links from the parent page.
For some reason, I am getting this sequence of errors for my code:
python ./scrape.py
/
Traceback (most recent call last):
File "./scrape.py", line 27, in <module>
a = requests.get(url)
File "/Library/Python/2.7/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/Library/Python/2.7/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Python/2.7/site-packages/requests/sessions.py", line 494, in request
prep = self.prepare_request(req)
File "/Library/Python/2.7/site-packages/requests/sessions.py", line 437, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/Library/Python/2.7/site-packages/requests/models.py", line 305, in prepare
self.prepare_url(url, params)
File "/Library/Python/2.7/site-packages/requests/models.py", line 379, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/': No schema supplied. Perhaps you meant http:///?
From my Python script here:
from bs4 import BeautifulSoup
import requests
#somesite = 'https://www.somesite.com/"
page = 'https://www.investopedia.com/terms/s/stop-limitorder.asp'
count = 0
#url = raw_input("Enter a website to extract the URL's from: ")
r = requests.get(page) #requests html document
data = r.text #set data = to html text
soup = BeautifulSoup(data, "html.parser") #parse data with BS
#count = 0;
#souplist = []
#list
A = []
#loop to seach for all <a> tags that hold urls, store page data in array
for link in soup.find_all('a'):
#print(link.get('href'))
url = link.get('href')
print(url)
a = requests.get(url)
#a = requests.get(url)
#data1 = a.text
#souplist.insert(0, BeautifulSoup[data1])
#++count
#
#for link in soup.find_all('p'):
#print(link.getText())
Some of the links on the page you are scraping are URLs relative to the website (https://www.investopedia.com), so you have to turn them into absolute URLs by joining them with the site address before requesting them.
try:
    from urllib.parse import urljoin, urlparse  # Python 3
except ImportError:
    from urlparse import urljoin, urlparse  # Python 2

# `page`, `soup` and `requests` come from the script in the question.
# href=True skips anchors that have no href attribute at all.
for link in soup.find_all('a', href=True):
    # urljoin() resolves relative links against the page they were found on
    # and returns already-absolute links unchanged.
    url = urljoin(page, link['href'])
    # Links such as mailto: or javascript: cannot be fetched over HTTP.
    if urlparse(url).scheme not in ('http', 'https'):
        continue
    print(url)
    a = requests.get(url)

Python Html table can't find data when running on server

Hi, my code doesn't work when it actually runs online: find() returns None. How can I fix this?
This is my code;
import time
import sys
import urllib
import re
from bs4 import BeautifulSoup, NavigableString
print "Initializing Python Script"
print "The passed arguments are "
urls = ["http://tweakers.net/pricewatch/355474/gigabyte-gv-n78toc-3g/specificaties/", "http://tweakers.net/pricewatch/328943/sapphire-radeon-hd-7950-3gb-gddr5-with-boosts/specificaties/", "https://www.alternate.nl/GIGABYTE/GV-N78TOC-3GD-grafische-kaart/html/product/1115798", "http://tweakers.net/pricewatch/320116/raspberry-pi-model-b-(512mb)/specificaties/"]
i =0
regex = '<title>(.+?)</title>'
pattern = re.compile(regex)
word = "tweakers"
alternate = "alternate"
while i<len(urls):
dataraw = urllib.urlopen(urls[i])
data = dataraw.read()
soup = BeautifulSoup(data)
table = soup.find("table", {"class" : "spec-detail"})
print table
i+=1
Here is the outcome:
Initializing Python Script
The passed arguments are
None
None
None
None
Script finalized
I have tried using findAll and other methods, but I don't understand why it works from my command line and not on the server itself.
Any help?
Edit
Traceback (most recent call last):
File "python_script.py", line 35, in
soup = BeautifulSoup(urllib2.urlopen(url), 'html.parser')
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 444, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
I'm suspecting you are experiencing the differences between parsers.
Specifying the parser explicitly works for me:
import urllib2
from bs4 import BeautifulSoup

# Product pages that are searched for a specification table.
urls = [
    "http://tweakers.net/pricewatch/355474/gigabyte-gv-n78toc-3g/specificaties/",
    "http://tweakers.net/pricewatch/328943/sapphire-radeon-hd-7950-3gb-gddr5-with-boosts/specificaties/",
    "https://www.alternate.nl/GIGABYTE/GV-N78TOC-3GD-grafische-kaart/html/product/1115798",
    "http://tweakers.net/pricewatch/320116/raspberry-pi-model-b-(512mb)/specificaties/",
]

for url in urls:
    response = urllib2.urlopen(url)
    # The parser is named explicitly instead of letting bs4 choose one.
    soup = BeautifulSoup(response, 'html.parser')
    # find() gives None when the page has no <table class="spec-detail">.
    table = soup.find("table", attrs={"class": "spec-detail"})
    print(table)
In this case, I'm using html.parser, but you can play around and specify lxml or html5lib, for example.
Note that the third url doesn't contain a table with class="spec-detail" and, therefore, it prints None for it.
I've also introduced few improvements:
removed unused imports
replaced a while loop with indexing with a nice for loop
removed unrelated code
replaced urllib with urllib2
You can also use requests module and set appropriate User-Agent header pretending to be a real browser:
from bs4 import BeautifulSoup
import requests

# Product pages that are searched for a specification table.
urls = ["http://tweakers.net/pricewatch/355474/gigabyte-gv-n78toc-3g/specificaties/",
        "http://tweakers.net/pricewatch/328943/sapphire-radeon-hd-7950-3gb-gddr5-with-boosts/specificaties/",
        "https://www.alternate.nl/GIGABYTE/GV-N78TOC-3GD-grafische-kaart/html/product/1115798",
        "http://tweakers.net/pricewatch/320116/raspberry-pi-model-b-(512mb)/specificaties/"]
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36'}

for url in urls:
    # The timeout keeps one unresponsive server from blocking the whole loop.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # find() gives None when the page has no <table class="spec-detail">.
    table = soup.find("table", {"class": "spec-detail"})
    # The parenthesised form prints the same on Python 2 and is valid on Python 3.
    print(table)
Hope that helps.

Python getting all links from a google search result page

I want to create a script that returns all the URLs found in a page (a Google results page, for example), so I wrote this script using BeautifulSoup:
import urllib2
from BeautifulSoup import BeautifulSoup
page = urllib2.urlopen("https://www.google.dz/search?q=see")
soup = BeautifulSoup(page.read())
links = soup.findAll("a")
for link in links:
print link["href"]
and it returns this 403 Forbidden error:
Traceback (most recent call last):
File "C:\Python27\sql\sql.py", line 3, in <module>
page = urllib2.urlopen("https://www.google.dz/search?q=see")
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 438, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Any idea how to avoid this error, or another method to get the URLs from the search results?
No problem using requests
import requests
# bs4 is the maintained BeautifulSoup package; the legacy `BeautifulSoup`
# (version 3) module is obsolete.
from bs4 import BeautifulSoup

page = requests.get("https://www.google.dz/search?q=see")
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(page.content, "html.parser")
# Every <a> element on the results page.
links = soup.find_all("a")
Some of the hrefs contain two URLs joined together (for example search%...:http://..., where the end of one runs into the start of the next), so we need to split them using re:
import re

import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.google.dz/search?q=see")
soup = BeautifulSoup(page.content, "html.parser")

# Keep only anchors whose href has the form /url?q=<http...>. Raw strings
# keep the backslash in \? from being read as a string escape.
result_href = re.compile(r"(?<=/url\?q=)(htt.*://.*)")
for link in soup.find_all("a", href=result_href):
    target = link["href"].replace("/url?q=", "")
    # An href may hold two URLs joined by ':'; split before each "http".
    print(re.split(r":(?=http)", target))
['https://www.see.asso.fr/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBIQFjAA&usg=AFQjCNF2_I8jB98JwR3jcKniLZekSrRO7Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:f7M8NX1XmDsJ', 'https://www.see.asso.fr/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBUQIDAA&usg=AFQjCNF8WJButjMNXQXvXBbtyXnF1SgiOg']
['https://www.see.asso.fr/3ei&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBgQ0gIoADAA&usg=AFQjCNGnPL1RiX5TekI_yMUc-w_f2oVXtw']
['https://www.see.asso.fr/node/9587&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBkQ0gIoATAA&usg=AFQjCNHX-6AzBgLQUF0s8TxFcZjIhxz_Hw']
['https://www.see.asso.fr/ree&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBoQ0gIoAjAA&usg=AFQjCNGkkd8e1JjiNrhSM4HQYE-M6g6j-w']
['https://www.see.asso.fr/node/130&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CBsQ0gIoAzAA&usg=AFQjCNEkVdpcbXDz5-cV9u2NNYoV6aM8VA']
['http://www.wordreference.com/enfr/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CB0QFjAB&usg=AFQjCNHQGwcsGpro26dhxFP6q-fQvwbB0Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:ooK-I_HuCkwJ', 'http://www.wordreference.com/enfr/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCAQIDAB&usg=AFQjCNFRlV5Zv_n48Wivr4LeOkTQsA0D1Q']
['http://fr.wikipedia.org/wiki/S%25C3%25A9e&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCMQFjAC&usg=AFQjCNGmtqmcXPqYZ_nwa0RWL0uYf5PMJw']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:GjcgkyzsUigJ', 'http://fr.wikipedia.org/wiki/S%2525C3%2525A9e%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCYQIDAC&usg=AFQjCNHesOIBU3OXBspARcONbK_k_8-gnw']
['http://fr.wikipedia.org/wiki/Camille_S%25C3%25A9e&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCkQFjAD&usg=AFQjCNGO-WIDl4TrBeo88WY9QsopWmsMyQ']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:izhQjC85nOoJ', 'http://fr.wikipedia.org/wiki/Camille_S%2525C3%2525A9e%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CCwQIDAD&usg=AFQjCNEfcIKsKbf026xgWT7NkrAueZvL0A']
['http://de.wikipedia.org/wiki/Zugersee&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDEQ9QEwBA&usg=AFQjCNHpfJW5-XdsgpFUSP-jEmHjXQUWHQ']
['http://commons.wikimedia.org/wiki/File:Champex_See.jpg&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDMQ9QEwBQ&usg=AFQjCNEordFWr2QIaob45WlR5Yi-ZvZSiA']
['http://www.all-free-photos.com/show/showphotop.php%3Fidtop%3D4%26lang%3Dfr&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDUQ9QEwBg&usg=AFQjCNEC24FOIE5cvF4zmEDgq5-5xubM3w']
['http://www.allbestwallpapers.com/travel-zell_am_see,_kaprun,_austria_wallpapers.html&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDcQ9QEwBw&usg=AFQjCNFkzMZDuthZHvnF-JvyksNUqjt1dQ']
['http://www.see-swe.org/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDkQFjAI&usg=AFQjCNF1zbcLfjanxgCXtHoOQXOdMgh_AQ']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:lzh6JxvKUTIJ', 'http://www.see-swe.org/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CDwQIDAI&usg=AFQjCNFYN6tzzVaHsAc5aOvYNql3Zy4m3A']
['http://fr.wiktionary.org/wiki/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CD8QFjAJ&usg=AFQjCNFWYIGc1gj0prytowzqI-0LDFRvZA']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:G9v8lXWRCyQJ', 'http://fr.wiktionary.org/wiki/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEIQIDAJ&usg=AFQjCNENzi4E1n-9qHYsNahY6lQzaW5Xvg']
['http://en.wiktionary.org/wiki/see&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEUQFjAK&usg=AFQjCNECGZjw-rBUALO43WaTh2yB9BUhDg']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:ywc4URuPdIQJ', 'http://en.wiktionary.org/wiki/see%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEgQIDAK&usg=AFQjCNE0pykIqXXRl08E-uTtoj03QEpnbg']
['http://see-concept.com/&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CEsQFjAL&usg=AFQjCNGFWjhiH7dEBhITJt01ob_JENlz1Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:jHTkOVEoRsAJ', 'http://see-concept.com/%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CE4QIDAL&usg=AFQjCNECPgxt9ZSFmZzK_ker9Hw_FoCi_A']
['http://www.theconjugator.com/la/conjugaison/du/verbe/see.html&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFEQFjAM&usg=AFQjCNETCTQ0vPDIdV_2Q57qq11dyN0d8Q']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:xD7_Qo7roS8J', 'http://www.theconjugator.com/la/conjugaison/du/verbe/see.html%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFQQIDAM&usg=AFQjCNF_hBCyDZncivYGnL7je5kYme9hEg']
['http://www.zellamsee-kaprun.com/fr&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFcQFjAN&usg=AFQjCNFVDeBWrZMDSjK9jKYF4AQlIXa9lA']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:BFBEUp05w7YJ', 'http://www.zellamsee-kaprun.com/fr%252Bsee%26hl%3Dfr%26%26ct%3Dclnk&sa=U&ei=ryv6U6PvEKzA7AaB4ICwCA&ved=0CFoQIDAN&usg=AFQjCNHtrOeEpYWqvT3f0M1p-gxUkYT1IA']
A more convenient way to do this is to use the third-party googlesearch module (installed with pip install google):
GeeksforGeeks writes about it here
from googlesearch import search

# to search
query = "see"
# Collect every result yielded by search() into a list in one step.
links = list(search(query, tld="co.in", num=10, stop=10, pause=2))
import urllib.request

from bs4 import BeautifulSoup

# A browser-like User-Agent is sent because the request with urllib's default
# agent was answered with "HTTP Error 403: Forbidden" in the question.
request = urllib.request.Request(
    "https://www.google.dz/search?q=see",
    headers={"User-Agent": "Mozilla/5.0"},
)
# The with-block closes the HTTP response once it has been read.
with urllib.request.urlopen(request) as page:
    soup = BeautifulSoup(page.read(), "html.parser")

# href=True skips anchors without an href, which would raise KeyError below.
links = soup.find_all("a", href=True)
for link in links:
    print(link["href"])

Categories