I have written the code to extract the data from the first page, but I am running into problems when trying to extract from all pages.
This is my code to extract data from page 'a':
from bs4 import BeautifulSoup
import urllib
import urllib.request
import os
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata

playerdatasaved = ""
soup = make_soup('https://www.basketball-reference.com/players/a/')
for record in soup.findAll("tr"):
    playerdata = ""
    for data in record.findAll(["th", "td"]):
        playerdata = playerdata + "," + data.text
    playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

print(playerdatasaved)

header = "player, from, to, position, height, weight, dob, year, colleges" + "\n"
file = open(os.path.expanduser("basketballstats.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(playerdatasaved[1:], encoding="ascii", errors="ignore"))
Now, to loop through all of the pages, my logic is this code:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import os
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata

playerdatasaved = ""
for letter in ascii_lowercase:
    soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
    for record in soup.findAll("tr"):
        playerdata = ""
        for data in record.findAll(["th", "td"]):
            playerdata = playerdata + "," + data.text
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "player, from, to, position, height, weight, dob, year, colleges" + "\n"
file = open(os.path.expanduser("basketball.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(playerdatasaved[1:], encoding="ascii", errors="ignore"))
However, this is running into an error relating to the line:
soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
I tried to run your code and ran into an SSL certificate error (CERTIFICATE_VERIFY_FAILED), which seems to be a problem with the website you are trying to scrape rather than with your code.
Maybe this Stack Overflow question can help clear things up:
"SSL: certificate_verify_failed" error when scraping https://www.thenewboston.com/
for letter in ascii_lowercase:
    soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
With the URL you provided, you are encountering a 404 error when letter = 'x'. It looks like that player index does not exist, so make sure you check for that case when going through the letters.
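One way to check for that case explicitly (a sketch of my own, reusing the make_soup helper from the question) is to catch urllib's HTTPError and skip the missing index:

import urllib.error
from string import ascii_lowercase

for letter in ascii_lowercase:
    try:
        soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
    except urllib.error.HTTPError as e:
        if e.code == 404:  # e.g. the 'x' index page does not exist
            continue       # skip this letter and move on
        raise              # re-raise anything unexpected
    # ... process soup as before ...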
Agreed with Eman. The page for 'x' is not available. Just use a try/except block to skip that page.
try:
    soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
    for record in soup.findAll("tr"):
        playerdata = ""
        for data in record.findAll(["th", "td"]):
            playerdata = playerdata + "," + data.text
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]
except Exception as e:
    print(e)
To fix your code: first, note that ascii_lowercase is already a string, so your loop for letter in ascii_lowercase: works as it is and you can run soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/") for each letter without wrapping anything in str().
The next thing is to handle exceptions when we cannot find a page. For example, "https://www.basketball-reference.com/players/x/" does not exist. For that, we can use try/except.
And last, but not least, you have to skip the header row of each table, otherwise you will end up with lots of repeated Player,From,To,Pos,Ht,Wt,Birth Date,Colleges lines in your file. So, do this:
for table in soup.findAll("tbody"):
    for record in table.findAll("tr"):
Instead of this:
for record in soup.findAll("tr"):
Here is the whole thing working:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import os
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata

playerdatasaved = ""
for letter in ascii_lowercase:
    print(letter)  # I added this to see the magic happening
    try:
        soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
        for table in soup.findAll("tbody"):      # tbody skips the header row
            for record in table.findAll("tr"):
                playerdata = ""
                for data in record.findAll(["th", "td"]):
                    playerdata = playerdata + "," + data.text
                playerdatasaved = playerdatasaved + "\n" + playerdata[1:]
    except Exception as e:
        print(e)  # e.g. the 404 for the missing 'x' page

header = "player, from, to, position, height, weight, dob, year, colleges" + "\n"
file = open(os.path.expanduser("basketball.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(playerdatasaved[1:], encoding="ascii", errors="ignore"))
file.close()
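As a side note (my own suggestion, not part of the answer above): some fields, such as the colleges column, can themselves contain commas, which shifts columns when you build rows by joining with ",". A sketch of the same loop using the csv module, which quotes such fields automatically (reusing make_soup and ascii_lowercase from the answer above):

import csv
import os

with open(os.path.expanduser("basketball.csv"), "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["player", "from", "to", "position", "height", "weight", "dob", "year", "colleges"])
    for letter in ascii_lowercase:
        try:
            soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
        except Exception as e:
            print(e)
            continue
        for table in soup.findAll("tbody"):
            for record in table.findAll("tr"):
                row = [data.text for data in record.findAll(["th", "td"])]
                writer.writerow(row)  # csv quotes any commas inside fields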
Related
I am trying to save the results in a .csv, but I receive the following message and have no idea how to fix it:
f.write(linha_csv)
ValueError: I/O operation on closed file.
Code below:
import requests
from bs4 import BeautifulSoup
import csv
from csv import reader, writer

url_base = "https://lista.mercadolivre.com.br/"
soup = BeautifulSoup(requests.get(url_base + produto_nome).content, "html.parser")
produtos = soup.findAll('div', attrs={'class': 'andes-card andes-card--flat andes-card--default ui-search-result ui-search-result--core andes-card--padding-default'})

with open(r'Lista_Precos_MercadoLivre.csv', 'a', encoding='utf8', newline='') as f:
    fieldnames = ['Produto', 'Link do Produto', 'Preco']
    dw = csv.DictWriter(f, delimiter=';', fieldnames=fieldnames)
    dw.writeheader()

i = 1
while True:
    for tag in soup:
        titulo = soup.find('h2', attrs={'class': 'ui-search-item__title'})
        print(i, tag.text)
        print(i, 'Título do Produto:', titulo.text)
        print(i, 'Link do Produto:', link['href'])
    next_link = soup.select_one("a.andes-pagination__link:-soup-contains(Seguinte)")
    if not next_link:
        break
    linha_csv = titulo.text + ';' + link['href'] + ';' + "R$" + real.text + "," + centavos.text + '\n'
    f.write(linha_csv)
Because the indentation in your question is not set correctly, the error is likely caused by that: f.write() is called after the with block has already closed the file. Moving the writing part into your for loop (inside the with block) should fix the issue:
for tag in soup.select('li.ui-search-layout__item'):
    linha_csv = tag.h2.text + ';' + tag.a['href'] + ';' + tag.select_one('.price-tag-amount').text + '\n'
    f.write(linha_csv)
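To see why the original code raised the ValueError, here is a minimal standalone sketch (hypothetical file name, my addition):

with open('example.csv', 'w', encoding='utf8') as f:
    f.write('inside the with block\n')   # fine, the file is open here
f.write('outside the with block\n')      # ValueError: I/O operation on closed file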
Example
import requests, csv
from bs4 import BeautifulSoup

url_base = "https://lista.mercadolivre.com.br/"
query = "vinho"

soup = BeautifulSoup(requests.get(url_base + query).content, "html.parser")

with open(r'Lista_Precos_MercadoLivre.csv', 'a', encoding='utf8', newline='') as f:
    fieldnames = ['Produto', 'Link do Produto', 'Preco']
    dw = csv.DictWriter(f, delimiter=';', fieldnames=fieldnames)
    dw.writeheader()

    while True:
        for tag in soup.select('li.ui-search-layout__item'):
            linha_csv = tag.h2.text + ';' + tag.a['href'] + ';' + tag.select_one('.price-tag-amount').text + '\n'
            f.write(linha_csv)
        next_link = soup.select_one("a.andes-pagination__link:-soup-contains(Seguinte)")
        if not next_link:
            break
        soup = BeautifulSoup(requests.get(next_link["href"]).content, "html.parser")
I'm having an issue with bs4 when reading the second value in an array within a for loop. Below I will paste the code.
However, when I use the single-item array (the commented-out URL_Array = [IEL_IDS]), I receive no errors. When I swap it out for the full array (URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]), it errors out when it attempts to gather the second value. Note that the second value in the full array is the same URL as in the single-item version.
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

SmartLiving_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="

Headers = "Description, URL, Price \n"

text_file = open("HayneedlePrices.csv", "w")
text_file.write(Headers)
text_file.close()

URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]
#URL_Array = [IEL_IDS]

for URL in URL_Array:
    print("\n" + "Loading New URL:" + "\n" + URL + "\n" + "\n")
    uClient = uReq(URL)
    page_html = uClient.read()
    uClient.close()
    soup = soup(page_html, "html.parser")

    Containers = soup.findAll("div", {"product-card__container___1U2Sb"})
    for Container in Containers:
        Title = Container.div.img["alt"]
        Product_URL = Container.a["href"]
        Price_Container = Container.findAll("div", {"class": "product-card__productInfo___30YSc body no-underline txt-black"})[0].findAll("span", {"style": "font-size:20px"})
        Price_Dollars = Price_Container[0].get_text()
        Price_Cents = Price_Container[1].get_text()

        print("\n" + "#" * 100 + "\n")
        # print(" Container: " + "\n" + str(Container))
        # print("\n" + "-" * 100 + "\n")
        print(" Description: " + str(Title))
        print(" Product URL: " + str(Product_URL))
        print(" Price: " + str(Price_Dollars) + str(Price_Cents))
        print("\n" + "#" * 100 + "\n")

        text_file = open("HayneedlePrices.csv", "a")
        text_file.write(str(Title) + ", " + str(Product_URL) + ", " + str(Price_Dollars) + str(Price_Cents) + "\n")
        text_file.close()

    print("Information gathered and Saved from URL Successfully.")
    print("Looking for Next URL..")
print("No Additional URLs to Gather. Process Completed.")
The problem is that you import BeautifulSoup as soup and then also define a variable soup = soup(page_html, "html.parser") with the same name! After the first URL is processed, the name soup no longer refers to the BeautifulSoup class but to the parsed page, so on the second URL that line no longer constructs a new parser and the subsequent findAll calls break.
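A minimal sketch (my addition) of the name shadowing in isolation:

from bs4 import BeautifulSoup as soup

html = "<p>hello</p>"
page = soup(html, "html.parser")   # works: 'soup' still names the BeautifulSoup class
soup = soup(html, "html.parser")   # works once, but rebinds the name 'soup' to the parsed document
# On the next iteration, soup(...) is no longer a call to the class, so the
# parsing step does not do what you expect and the later lookups fail.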
I refactored your code a bit, let me know if it works as expected!
import csv
import requests
from bs4 import BeautifulSoup

smart_living_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="

site_URLs = [smart_living_IDS, IEL_IDS, TD_IDS]

sess = requests.Session()
prod_data = []

for curr_URL in site_URLs:
    req = sess.get(url=curr_URL)
    soup = BeautifulSoup(req.content, "lxml")
    containers = soup.find_all("div", {"product-card__container___1U2Sb"})
    for curr_container in containers:
        prod_title = curr_container.div.img["alt"]
        prod_URL = curr_container.a["href"]
        price_container = curr_container.find(
            "div",
            {"class": "product-card__productInfo___30YSc body no-underline txt-black"},
        )
        dollars_elem = price_container.find("span", {"class": "main-price-dollars"})
        cents_elem = dollars_elem.find_next("span")
        prod_price = dollars_elem.get_text() + cents_elem.get_text()
        prod_price = float(prod_price[1:])
        prod_data.append((prod_title, prod_URL, prod_price))

CSV_headers = ("title", "URL", "price")
with open("../out/hayneedle_prices.csv", "w", newline="") as file_out:
    writer = csv.writer(file_out)
    writer.writerow(CSV_headers)
    writer.writerows(prod_data)
I tested it by repeating the current URL list 10 times, and it took longer than I was anticipating. There are certainly improvements to be made; I might rewrite it to use lxml in the next few days, and multiprocessing might also be a good option. It all depends on how you're using this, of course :)
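For what it's worth, a rough sketch of fetching the URLs concurrently with a thread pool (my own suggestion, assuming the site_URLs list and the parsing logic from the answer above; fetch_containers is a hypothetical helper):

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch_containers(url):
    # fetch one search page and return its product containers
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "lxml")
    return soup.find_all("div", {"product-card__container___1U2Sb"})

with ThreadPoolExecutor(max_workers=3) as pool:
    # each URL is fetched in its own thread; results keep the input order
    all_containers = list(pool.map(fetch_containers, site_URLs))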
I would like to know how to export my crawling results into multiple CSV files, one for each city that I have crawled. Somehow I'm running into walls and can't find a proper way to sort it out.
That is my code:
import requests
from bs4 import BeautifulSoup
import csv

user_agent = {'User-agent': 'Chrome/43.0.2357.124'}
output_file = open("TA.csv", "w", newline='')

RegionIDArray = [187147, 187323, 186338]
dict = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
already_printed = set()

for reg in RegionIDArray:
    for page in range(1, 700, 30):
        r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
        soup = BeautifulSoup(r.content)

        g_data = soup.find_all("div", {"class": "element_wrap"})
        for item in g_data:
            header = item.find_all("div", {"class": "property_title"})
            item = (header[0].text.strip())
            if item not in already_printed:
                already_printed.add(item)
                print("POI: " + str(item) + " | " + "Location: " + str(dict[reg]))

                writer = csv.writer(output_file)
                csv_fields = ['POI', 'Locaton']
                if g_data:
                    writer.writerow([str(item), str(dict[reg])])
My goal would be to get three separate CSV files for Paris, Berlin and London instead of getting all the results in one big CSV file.
Could you guys help me out? Thanks for your feedback:)
I made some minor modifications to your code. To create one file per locale, I moved the output file name inside the loop.
Note that I don't have time now; the very last part is a hack to ignore Unicode errors -- it just skips trying to output a line with a non-ASCII character. That isn't good. Maybe someone can fix that part?
import requests
from bs4 import BeautifulSoup
import csv

user_agent = {'User-agent': 'Chrome/43.0.2357.124'}

RegionIDArray = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
already_printed = set()

for reg in RegionIDArray:
    output_file = open("TA" + str(reg) + ".csv", "w")
    for page in range(1, 700, 30):
        r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
        soup = BeautifulSoup(r.content)

        g_data = soup.find_all("div", {"class": "element_wrap"})
        for item in g_data:
            header = item.find_all("div", {"class": "property_title"})
            item = (header[0].text.strip())
            if item not in already_printed:
                already_printed.add(item)
                # print("POI: " + str(item) + " | " + "Location: " + str(RegionIDArray[reg]))

                writer = csv.writer(output_file)
                csv_fields = ['POI', 'Locaton']
                if g_data:
                    try:
                        writer.writerow([str(item), str(RegionIDArray[reg])])
                    except:
                        pass
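A possible cleanup of that Unicode hack (my suggestion, assuming Python 3): open each file with an explicit UTF-8 encoding, so non-ASCII POI names can be written directly and the try/except is no longer needed. A minimal standalone sketch with a hypothetical file name and row:

import csv

# with encoding="utf-8", the writer handles non-ASCII names without skipping rows
with open("TA_example.csv", "w", encoding="utf-8", newline="") as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["POI", "Location"])
    writer.writerow(["Musée d'Orsay", "Paris"])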
I'm trying to open multiple pages using urllib2. The problem is that some pages can't be opened; it returns urllib2.HTTPError: HTTP Error 400: Bad Request.
I'm getting the hrefs of these pages from another web page (whose head declares charset="utf-8").
The error is returned only when I try to open a page containing 'č', 'ž' or 'ř' in the URL.
Here is the code:
def getSoup(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    page = response.read()
    soup = BeautifulSoup(page, 'html.parser')
    return soup

hovienko = getSoup("http://www.hovno.cz/hovna-az/a/1/")
lis = hovienko.find("div", class_="span12").find('ul').findAll('li')

for liTag in lis:
    aTag = liTag.find('a')['href']
    href = "http://www.hovno.cz" + aTag          # hrefs I'm trying to open using urllib2
    soup = getSoup(href.encode("iso-8859-2"))    # here the errors occur when 'č', 'ž' or 'ř' is in the URL
Does anybody know what I have to do to avoid these errors?
Thank you
This site is UTF-8. Why do you need href.encode("iso-8859-2")? I took the following code from http://programming-review.com/beautifulsoasome-interesting-python-functions/
import urllib2
import cgitb
cgitb.enable()
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

# print all links
def PrintLinks(localurl):
    data = urllib2.urlopen(localurl).read()
    print 'Encoding of fetched HTML : %s', type(data)
    soup = BeautifulSoup(data)
    parse = urlparse(localurl)
    localurl = parse[0] + "://" + parse[1]
    print "<h3>Page links statistics</h3>"
    l = soup.findAll("a", attrs={"href": True})
    print "<h4>Total links count = " + str(len(l)) + '</h4>'
    externallinks = []  # external links list
    for link in l:
        # if it's an external link
        if link['href'].find("http://") == 0 and link['href'].find(localurl) == -1:
            externallinks = externallinks + [link]
    print "<h4>External links count = " + str(len(externallinks)) + '</h4>'
    if len(externallinks) > 0:
        print "<h3>External links list:</h3>"
        for link in externallinks:
            if link.text != '':
                print '<h5>' + link.text.encode('utf-8')
                print ' => [' + '<a href="' + link['href'] + '" >' + link['href'] + '</a>' + ']' + '</h5>'
            else:
                print '<h5>' + '[image]',
                print ' => [' + '<a href="' + link['href'] + '" >' + link['href'] + '</a>' + ']' + '</h5>'

PrintLinks("http://www.zlatestranky.cz/pro-mobily/")
The solution was very simple. I should have used urllib2.quote().
EDITED CODE:
for liTag in lis:
    aTag = liTag.find('a')['href']
    href = "http://www.hovno.cz" + urllib2.quote(aTag.encode("utf-8"))
    soup = getSoup(href)
A couple of things here.
First, your URIs can't contain non-ASCII characters. You have to percent-encode them. See this:
How to fetch a non-ascii url with Python urlopen?
Secondly, save yourself a world of pain and use requests for HTTP stuff.
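For instance, a minimal sketch of the same fetch with requests (my addition; requests takes care of percent-encoding non-ASCII characters in the URL, so the hrefs with 'č', 'ž' or 'ř' gathered in the loop above can be passed in directly):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()   # surface 4xx/5xx responses as exceptions
    return BeautifulSoup(resp.text, "html.parser")

soup = get_soup(u"http://www.hovno.cz/hovna-az/a/1/")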
I'm having a bit of trouble automatically scraping data in a table from a Wikipedia article. First I was getting an encoding error. I specified UTF-8 and the error went away, but the scraped data doesn't display a lot of the characters correctly. You will be able to tell from the code that I am a complete newbie:
from bs4 import BeautifulSoup
import urllib2

wiki = "http://en.wikipedia.org/wiki/Anderson_Silva"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

Result = ""
Record = ""
Opponent = ""
Method = ""
Event = ""
Date = ""
Round = ""
Time = ""
Location = ""
Notes = ""

table = soup.find("table", {"class": "wikitable sortable"})

f = open('output.csv', 'w')
for row in table.findAll("tr"):
    cells = row.findAll("td")
    # For each "tr", assign each "td" to a variable.
    if len(cells) == 10:
        Result = cells[0].find(text=True)
        Record = cells[1].find(text=True)
        Opponent = cells[2].find(text=True)
        Method = cells[3].find(text=True)
        Event = cells[4].find(text=True)
        Date = cells[5].find(text=True)
        Round = cells[6].find(text=True)
        Time = cells[7].find(text=True)
        Location = cells[8].find(text=True)
        Notes = cells[9].find(text=True)

        write_to_file = Result + "," + Record + "," + Opponent + "," + Method + "," + Event + "," + Date + "," + Round + "," + Time + "," + Location + "\n"
        write_to_unicode = write_to_file.encode('utf-8')
        print write_to_unicode
        f.write(write_to_unicode)

f.close()
As pswaminathan pointed out, using the csv module will help greatly. Here is how I do it:
table = soup.find('table', {'class': 'wikitable sortable'})
with open('out2.csv', 'w') as f:
    csvwriter = csv.writer(f)
    for row in table.findAll('tr'):
        cells = [c.text.encode('utf-8') for c in row.findAll('td')]
        if len(cells) == 10:
            csvwriter.writerow(cells)
Discussion
Using the csv module, I created a csvwriter object connected to my output file.
By using the with statement, I don't need to worry about closing the output file when done: it is closed automatically at the end of the with block.
In my code, cells is a list of UTF8-encoded text extracted from the td tags within a tr tag.
I used the construct c.text, which is more concise than c.find(text=True).
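If you are on Python 3, a sketch of the same idea without manually encoding each cell (my addition; soup here is the parsed Wikipedia page from above):

import csv

table = soup.find('table', {'class': 'wikitable sortable'})
with open('out3.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f)
    for row in table.findAll('tr'):
        cells = [c.text for c in row.findAll('td')]   # open() handles the encoding
        if len(cells) == 10:
            csvwriter.writerow(cells)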