Creating multiple text files with unique file names from scraped data

Creating multiple text files with unique file names from scraped data - python

I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files of which the title will be different for each file.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. Title of the .txt file should for example be 'Aconite.txt' and the content of the file should be the title and the definition. Every term with its definition can be found in a separate p-tag and the term itself is a b-tag withing the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env/ python
import requests
import bs4
def download(url):
r = requests.get(url)
html = r.text
soup = bs4.BeautifulSoup(html, 'html.parser')
terms_definition = []
#for item in soup.find_all('p'): #beter definiëren
items = soup.find_all("div", {"class" : "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
#print(term)
if term.text is not 'None':
#print(term.text)
#print("\n")
term_split = term.text.split()
print(term_split)
if term.text != None and len(term.text) > 1:
if '-' in term.text.split():
print(term.text)
print('\n')
if item.find('p'):
terms_definition.append(item['p'])
print(terms_definition)
return terms_definition
def create_url(start, end):
list_url = []
base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
for x in range(start, end):
list_url.append(base_url + str(x))
return list_url
def search_all_url(list_url):
for url in list_url:
download(url)
#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term
def text_files
path_write = os.path.join('data', name_term +'.txt') #'term' should be replaced by the scraped terms
with open(path_write, 'w') as f:
f.write()
#for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file
if __name__ == '__main__':
download('http://www.hogwartsishere.com/library/book/99/chapter/1')
#list_url = create_url(1, 27)
#search_all_url(list_url)
Thanks in advance!

You can iterate over all pages (1-27) to get its content, then parse each page with bs4 and then save results to files:
import requests
import bs4
import re
for i in range(1, 27):
r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
soup = bs4.BeautifulSoup(r, 'html.parser')
items = soup.find_all("div", {"class": "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
title = re.match('^(.*) -', term.text).group(1).replace('/', '-')
with open(title + '.txt', 'w', encoding='utf-8') as f:
f.write(term.text)
Output files:

Related

Why does my web scraper write everything into a single line

Complete newbie but I've managed to successfully scrape EAN numbers with Python from a list of links created by an upstream piece of code. However, my output file contains all the scraped numbers as a continuous single line instead of one EAN per line.
Here's my code - what's wrong with it? (scraped URL redacted)
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
subpage = 1
while subpage <= 2:
URL = "https://..." + str(subpage)
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
"""writes all links under the h2 tag into a list"""
links = []
h2s = soup.find_all("h2")
for h2 in h2s:
links.append("http://www.xxxxxxxxxxx.com" + h2.a['href'])
"""opens links from list and extracts EAN number from underlying page"""
with open("temp.txt", "a") as output:
for link in links:
urllib.request.urlopen(link)
page_2 = requests.get(link)
soup_2 = BeautifulSoup(page_2.content, "html.parser")
if "EAN:" in soup_2.text:
span = soup_2.find(class_="articleData_ean")
EAN = span.a.text
output.write(EAN)
subpage += 1
os.replace('temp.txt', 'EANs.txt')

output.write(EAN) is writing each EAN without anything between them. It doesn't automatically add a separator or newline. You can add a newline: output.write('\n') or comma, etc. to separate them

From scraping to a CSV file

I am new to python and I am trying to turn scraping data to a CSV file but without success.
Here is the code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import random
import re
from itertools import cycle
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>') #cleaning the strings from these terms
cleantext = re.sub(cleanr, '', raw_html)
return cleantext
def scrape(url, filename, number_id):
"""
This function scrapes a web page looking for text inside its html structure and saves it in .txt file.
So it works only for static content, if you need text in a dynamic part of the web page (e.g. a banner)
look at the other file. Pay attention that the retrieved text must be filtered out
in order to keep only the part you need.
url: url to scrape
filename: name of file where to store text
number_id: itis appended to the filename, to distinguish different filenames
"""
#here there is a list of possible user agents
user_agent = random.choice(user_agent_list)
req = Request(url, headers={'User-Agent': user_agent})
page = urlopen(req).read()
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, "html.parser")
row = soup.find_all(class_="row")
for element in row:
viaggio = element.find_all(class_="nowrap")
Partenza = viaggio[0]
Ritorno = viaggio[1]
Viaggiatori = viaggio[2]
Costo = viaggio[3]
Title = element.find(class_="taglist bold")
Content = element.find("p")
Destination = Title.text
Review = Content.text
Departure = Partenza.text
Arrival = Ritorno.text
Travellers = Viaggiatori.text
Cost = Costo.text
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
print(TuristiPerCasoList)
Till here, everything works. Now I have to turn it into a CSV file.
I tried with this:
import csv
with open('turistipercaso','w') as file:
writer = csv.writer(file)
writer.writerows(TuristiPerCasoList)
but it doesn't return anything in the CSV file.
Can someone help me understanding what to do to turn into a CSV file?

In each iteration, you are reassigning the TuristiPerCasoList value.
What you actually want is a list of list of strings, where the string is the value for a specific cell, the second list contains the values of a row and the first list contains all the rows.
To achieve this, you should append a list representing a row to the main list:
# instead of
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
# use
TuristiPerCasoList.append([Destination, Review, Departure, Arrival, Travellers, Cost])

loop stuck on first page

Been using beautiful soup to iterate through pages, but for whatever reason I can't get the loop to advance beyond the first page. it seems like it should be easy because it's a text string, but it seems to loop back, maybe it's my structure not my text string?
Here's what I have:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
this gets the same first 40 records over and over.
I tried using this solution as an if and did find that doing
prevLink = soup.select('a[rel="nofollow"]')[0]
newurl = "http:" + prevLink.get('href')
did work better, but I'm not sure how to do the loop in such a way that it advances? possibly just tired but my loop there still just goes to the next set of records and gets stuck on that one. please help me fix my loop
UPDATE
my formatting was lost in the copy paste, my actual code looks like:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()

Your code indenting was mostly at fault. Also it would be wise to actually use the CSV library you imported, this will automatically wrap the player names in quotes to avoid any commas inside from ruining the csv structure.
This works by looking for the link to the next page and extracting the starting count. This is then used to build your the next page get. If no next page can be found, it moves to the next year group. Note, the count is not a page count but a starting entry count.
import csv
import urllib2
from bs4 import BeautifulSoup
groups= ['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015", "2014", "2013", "2012"]
with open('nhlstats.csv', "wb") as f_output:
csv_output = csv.writer(f_output)
for yr in year:
for gr in groups:
start_count = 1
while True:
#print "{}, {}, {}".format(yr, gr, start_count) # show progress
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/{}/count/{}".format(yr, start_count)
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row = [yr]
for ob in range(1, 15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
csv_output.writerow(row)
try:
start_count = int(soup.find(attrs= {"class":"page-numbers"}).find_next('a')['href'].rsplit('/', 1)[1])
except:
break
Using with will also automatically close your file at the end.
This would give you a csv file starting as follows:
2016,"Patrick Kane, RW",CHI,82,46,60,106,17,30,1.29,287,16.0,9,17,20
2016,"Jamie Benn, LW",DAL,82,41,48,89,7,64,1.09,247,16.6,5,17,13
2016,"Sidney Crosby, C",PIT,80,36,49,85,19,42,1.06,248,14.5,9,10,14
2016,"Joe Thornton, C",SJ,82,19,63,82,25,54,1.00,121,15.7,6,8,21

You are changing the URL many times before you are opening it the first time, due to an indentation error. Try this:
for gr in groups:
url = "...some_url..."
page = urllib2.urlopen(url)
...everything else should be indented....

Python: Parse from list only prints last item, not all?

My code:
from urllib2 import urlopen
from bs4 import BeautifulSoup
url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href = True)
files = []
base = "https://realpython.com/practice/"
def page_names():
for a in links:
files.append(base + a['href'])
page_names()
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup
The first half of the parsing collects three links, the second half is supposed to print out all of their html.
Sadly, it only prints the last link's html.
Possibly because of
for i in files:
all_page = urlopen(i)
It was working previously with 8 lines of code serving the for i in files: purpose but I wanted to clean it up and got it down to those two. Well, clearly not because it doesn't work.
No error though!

You only store the last value in your loop, you need to move all the assignments and the print inside the loop:
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup
If you are going to use functions I would pass parameters and create the list otherwise you might get unexpected output:
def page_names(b,lnks):
files = []
for a in lnks:
files.append(b + a['href'])
return files
for i in page_names(base,links):
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_s
Your function can then return a list comprehension:
def page_names(b,lnks):
return [b + a['href'] for a in lnks]

In your for loop you are assinging to all_page, which will overwrite it on each loop through, so it will only ever have the value of the last iteration.
If you want it to print the all_soup for each page you could just indent those 3 lines to be inside the for loop as well, then they would be executed each time through the loop.

It seems to be jsut a formatting issue, you probably meant to print it in the loop, right?
for i in files:
all_page = urlopen(i)
all_text = all_page.read()
all_soup = BeautifulSoup(all_text)
print all_soup

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting tags and ending tags. If anyone out there would like to help an amateur python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information that can be easily copied and pasted into an excel document for subsequent data analysis!
For example, lets say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).

import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the outputs, I think you can store them in a csv file.

I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use immutable objects to store data... I'm sure this this the wrong way of doing it. But its worked with several projects for me in the past.
import urllib2
from HTMLParser import HTMLParser
import csv
position = []
results = [""]
class hp(HTMLParser):
def handle_starttag(self, tag, attrs):
if tag == 'span' and ('class', 'watch-view-count ') in attrs:
position.append('bingo')
def handle_endtag(self, tag):
if tag == 'span' and 'bingo' in position:
position.remove('bingo')
def handle_data(self, data):
if 'bingo' in position:
results[0] += " " + data.strip() + " "
my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
response = urllib2.urlopen(url)
page = str(response.read())
parser = hp()
parser.feed(page)
data.append(results[0])
# reinitialize immutiable objects
position = []
results = [""]
index = 0
with open('/path/to/test.csv', 'wb') as f:
writer = csv.writer(f)
header = ['url', 'output']
writer.writerow(header)
for d in data:
row = [my_pages[index], data[index]]
writer.writerow(row)
index += 1
Then just open /path/to/test.csv in Excel

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Creating multiple text files with unique file names from scraped data - python

Related

Why does my web scraper write everything into a single line

From scraping to a CSV file

loop stuck on first page

Python: Parse from list only prints last item, not all?

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

Categories

Resources