Complete newbie but I've managed to successfully scrape EAN numbers with Python from a list of links created by an upstream piece of code. However, my output file contains all the scraped numbers as a continuous single line instead of one EAN per line.
Here's my code - what's wrong with it? (scraped URL redacted)
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
subpage = 1
while subpage <= 2:
URL = "https://..." + str(subpage)
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
"""writes all links under the h2 tag into a list"""
links = []
h2s = soup.find_all("h2")
for h2 in h2s:
links.append("http://www.xxxxxxxxxxx.com" + h2.a['href'])
"""opens links from list and extracts EAN number from underlying page"""
with open("temp.txt", "a") as output:
for link in links:
urllib.request.urlopen(link)
page_2 = requests.get(link)
soup_2 = BeautifulSoup(page_2.content, "html.parser")
if "EAN:" in soup_2.text:
span = soup_2.find(class_="articleData_ean")
EAN = span.a.text
output.write(EAN)
subpage += 1
os.replace('temp.txt', 'EANs.txt')
output.write(EAN) is writing each EAN without anything between them. It doesn't automatically add a separator or newline. You can add a newline: output.write('\n') or comma, etc. to separate them
I am new to python and I am trying to turn scraping data to a CSV file but without success.
Here is the code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import random
import re
from itertools import cycle
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>') #cleaning the strings from these terms
cleantext = re.sub(cleanr, '', raw_html)
return cleantext
def scrape(url, filename, number_id):
"""
This function scrapes a web page looking for text inside its html structure and saves it in .txt file.
So it works only for static content, if you need text in a dynamic part of the web page (e.g. a banner)
look at the other file. Pay attention that the retrieved text must be filtered out
in order to keep only the part you need.
url: url to scrape
filename: name of file where to store text
number_id: itis appended to the filename, to distinguish different filenames
"""
#here there is a list of possible user agents
user_agent = random.choice(user_agent_list)
req = Request(url, headers={'User-Agent': user_agent})
page = urlopen(req).read()
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, "html.parser")
row = soup.find_all(class_="row")
for element in row:
viaggio = element.find_all(class_="nowrap")
Partenza = viaggio[0]
Ritorno = viaggio[1]
Viaggiatori = viaggio[2]
Costo = viaggio[3]
Title = element.find(class_="taglist bold")
Content = element.find("p")
Destination = Title.text
Review = Content.text
Departure = Partenza.text
Arrival = Ritorno.text
Travellers = Viaggiatori.text
Cost = Costo.text
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
print(TuristiPerCasoList)
Till here, everything works. Now I have to turn it into a CSV file.
I tried with this:
import csv
with open('turistipercaso','w') as file:
writer = csv.writer(file)
writer.writerows(TuristiPerCasoList)
but it doesn't return anything in the CSV file.
Can someone help me understanding what to do to turn into a CSV file?
In each iteration, you are reassigning the TuristiPerCasoList value.
What you actually want is a list of list of strings, where the string is the value for a specific cell, the second list contains the values of a row and the first list contains all the rows.
To achieve this, you should append a list representing a row to the main list:
# instead of
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
# use
TuristiPerCasoList.append([Destination, Review, Departure, Arrival, Travellers, Cost])
Been using beautiful soup to iterate through pages, but for whatever reason I can't get the loop to advance beyond the first page. it seems like it should be easy because it's a text string, but it seems to loop back, maybe it's my structure not my text string?
Here's what I have:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
this gets the same first 40 records over and over.
I tried using this solution as an if and did find that doing
prevLink = soup.select('a[rel="nofollow"]')[0]
newurl = "http:" + prevLink.get('href')
did work better, but I'm not sure how to do the loop in such a way that it advances? possibly just tired but my loop there still just goes to the next set of records and gets stuck on that one. please help me fix my loop
UPDATE
my formatting was lost in the copy paste, my actual code looks like:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
Your code indenting was mostly at fault. Also it would be wise to actually use the CSV library you imported, this will automatically wrap the player names in quotes to avoid any commas inside from ruining the csv structure.
This works by looking for the link to the next page and extracting the starting count. This is then used to build your the next page get. If no next page can be found, it moves to the next year group. Note, the count is not a page count but a starting entry count.
import csv
import urllib2
from bs4 import BeautifulSoup
groups= ['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015", "2014", "2013", "2012"]
with open('nhlstats.csv', "wb") as f_output:
csv_output = csv.writer(f_output)
for yr in year:
for gr in groups:
start_count = 1
while True:
#print "{}, {}, {}".format(yr, gr, start_count) # show progress
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/{}/count/{}".format(yr, start_count)
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row = [yr]
for ob in range(1, 15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
csv_output.writerow(row)
try:
start_count = int(soup.find(attrs= {"class":"page-numbers"}).find_next('a')['href'].rsplit('/', 1)[1])
except:
break
Using with will also automatically close your file at the end.
This would give you a csv file starting as follows:
2016,"Patrick Kane, RW",CHI,82,46,60,106,17,30,1.29,287,16.0,9,17,20
2016,"Jamie Benn, LW",DAL,82,41,48,89,7,64,1.09,247,16.6,5,17,13
2016,"Sidney Crosby, C",PIT,80,36,49,85,19,42,1.06,248,14.5,9,10,14
2016,"Joe Thornton, C",SJ,82,19,63,82,25,54,1.00,121,15.7,6,8,21
You are changing the URL many times before you are opening it the first time, due to an indentation error. Try this:
for gr in groups:
url = "...some_url..."
page = urllib2.urlopen(url)
...everything else should be indented....
My code:
from urllib2 import urlopen
from bs4 import BeautifulSoup
url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href = True)
files = []
base = "https://realpython.com/practice/"
def page_names():
for a in links:
files.append(base + a['href'])
page_names()
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup
The first half of the parsing collects three links, the second half is supposed to print out all of their html.
Sadly, it only prints the last link's html.
Possibly because of
for i in files:
all_page = urlopen(i)
It was working previously with 8 lines of code serving the for i in files: purpose but I wanted to clean it up and got it down to those two. Well, clearly not because it doesn't work.
No error though!
You only store the last value in your loop, you need to move all the assignments and the print inside the loop:
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup
If you are going to use functions I would pass parameters and create the list otherwise you might get unexpected output:
def page_names(b,lnks):
files = []
for a in lnks:
files.append(b + a['href'])
return files
for i in page_names(base,links):
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_s
Your function can then return a list comprehension:
def page_names(b,lnks):
return [b + a['href'] for a in lnks]
In your for loop you are assinging to all_page, which will overwrite it on each loop through, so it will only ever have the value of the last iteration.
If you want it to print the all_soup for each page you could just indent those 3 lines to be inside the for loop as well, then they would be executed each time through the loop.
It seems to be jsut a formatting issue, you probably meant to print it in the loop, right?
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup
I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting tags and ending tags. If anyone out there would like to help an amateur python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information that can be easily copied and pasted into an excel document for subsequent data analysis!
For example, lets say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the outputs, I think you can store them in a csv file.
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use immutable objects to store data... I'm sure this this the wrong way of doing it. But its worked with several projects for me in the past.
import urllib2
from HTMLParser import HTMLParser
import csv
position = []
results = [""]
class hp(HTMLParser):
def handle_starttag(self, tag, attrs):
if tag == 'span' and ('class', 'watch-view-count ') in attrs:
position.append('bingo')
def handle_endtag(self, tag):
if tag == 'span' and 'bingo' in position:
position.remove('bingo')
def handle_data(self, data):
if 'bingo' in position:
results[0] += " " + data.strip() + " "
my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
response = urllib2.urlopen(url)
page = str(response.read())
parser = hp()
parser.feed(page)
data.append(results[0])
# reinitialize immutiable objects
position = []
results = [""]
index = 0
with open('/path/to/test.csv', 'wb') as f:
writer = csv.writer(f)
header = ['url', 'output']
writer.writerow(header)
for d in data:
row = [my_pages[index], data[index]]
writer.writerow(row)
index += 1
Then just open /path/to/test.csv in Excel