From scraping to a CSV file - python

I am new to python and I am trying to turn scraping data to a CSV file but without success.
Here is the code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import random
import re
from itertools import cycle
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>') #cleaning the strings from these terms
cleantext = re.sub(cleanr, '', raw_html)
return cleantext
def scrape(url, filename, number_id):
"""
This function scrapes a web page looking for text inside its html structure and saves it in .txt file.
So it works only for static content, if you need text in a dynamic part of the web page (e.g. a banner)
look at the other file. Pay attention that the retrieved text must be filtered out
in order to keep only the part you need.
url: url to scrape
filename: name of file where to store text
number_id: itis appended to the filename, to distinguish different filenames
"""
#here there is a list of possible user agents
user_agent = random.choice(user_agent_list)
req = Request(url, headers={'User-Agent': user_agent})
page = urlopen(req).read()
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, "html.parser")
row = soup.find_all(class_="row")
for element in row:
viaggio = element.find_all(class_="nowrap")
Partenza = viaggio[0]
Ritorno = viaggio[1]
Viaggiatori = viaggio[2]
Costo = viaggio[3]
Title = element.find(class_="taglist bold")
Content = element.find("p")
Destination = Title.text
Review = Content.text
Departure = Partenza.text
Arrival = Ritorno.text
Travellers = Viaggiatori.text
Cost = Costo.text
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
print(TuristiPerCasoList)
Till here, everything works. Now I have to turn it into a CSV file.
I tried with this:
import csv
with open('turistipercaso','w') as file:
writer = csv.writer(file)
writer.writerows(TuristiPerCasoList)
but it doesn't return anything in the CSV file.
Can someone help me understanding what to do to turn into a CSV file?

In each iteration, you are reassigning the TuristiPerCasoList value.
What you actually want is a list of list of strings, where the string is the value for a specific cell, the second list contains the values of a row and the first list contains all the rows.
To achieve this, you should append a list representing a row to the main list:
# instead of
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
# use
TuristiPerCasoList.append([Destination, Review, Departure, Arrival, Travellers, Cost])

Related

BeautifulSoup 4 HTML Web Scraping - Find Mailto Links and Export to Spreadsheet

I am trying to scrape all email addresses from this index page - http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL
I modified a python script to define the string, parse content with BS4 and save each unique address to an xls file:
import requests
from bs4 import BeautifulSoup
import xlwt
wb = xlwt.Workbook()
ws = wb.add_sheet('Emails')
ws.write(0,0,'Emails')
emailList= []
r=0
#add url of the page you want to scrape to urlString
urlString='http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'
#function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
getH=requests.get(urlString)
h=getH.content
soup=BeautifulSoup(h,'html.parser')
mailtos = soup.select('a[href^=mailto]')
for i in mailtos:
href=i['href']
try:
str1, str2 = href.split(':')
except ValueError:
break
emailList.append(str2)
emailExtractor(urlString)
#adding scraped emails to an excel sheet
for email in emailList:
r=r+1
ws.write(r,0,email)
wb.save('emails.xls')
The xls file exports as expected, but with no email values. If anyone can explain why or how to simplify this solution it would be greatly appreciated!
Because the emails are protected. I am adding only the email scraping part. and not adding the excel part since you dont have issues with that. converting email protected to text credit goes to https://stackoverflow.com/a/36913154/7518304
emailList= []
r=0
#add url of the page you want to scrape to urlString
urlString='http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'
def decodeEmail(e): #https://stackoverflow.com/a/36913154/7518304
de = ""
k = int(e[:2], 16)
for i in range(2, len(e)-1, 2):
de += chr(int(e[i:i+2], 16)^k)
return de
#function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
getH=requests.get(urlString)
h=getH.content
soup=BeautifulSoup(h,'html.parser')
mailtos = soup.select('a[href]')
for i in mailtos:
href=i['href']
if "email-protect" in href:
emailList.append(decodeEmail(href.split("#")[1]))
emailExtractor(urlString)
emailList
You can use pandas for this. Here is the full code:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'
# function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
emailList = []
getH = requests.get(urlString)
h = getH.content
soup = BeautifulSoup(h, 'html.parser')
mailtos = soup.find_all('a')
href_lst = []
for i in mailtos:
href_lst.append(i['href'])
for href in href_lst:
if ':' in href:
emailList.append(href)
print(emailList)
s = pd.Series(emailList)
s = s.rename('Emails')
s.to_excel('D:\\Emails.xls',index=False)
emailExtractor(urlString)
Output:
['http://msa.uschess.org/AffDtlMain.php?T6006791', 'https://alabamachess.org', 'http://msa.uschess.org/AffDtlMain.php?A6029262', 'http://www.caesarchess.com/', 'http://msa.uschess.org/AffDtlMain.php?A6045660', 'http://msa.uschess.org/AffDtlMain.php?H6046485', 'http://msa.uschess.org/AffDtlMain.php?A6040580']
Excel Sheet Screenshot:
If you want the links to be output to the excel sheet as hyperlinks (you will be redirected to the website once you click the link), then change emailList.append(href) to emailList.append('=HYPERLINK("'+href+'")').
And at the same time, you should also change the file extension to .xlsx. Only then, you can get the links as hyperlinks.
Output:
Hope this helps!

Creating multiple text files with unique file names from scraped data

I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files of which the title will be different for each file.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. Title of the .txt file should for example be 'Aconite.txt' and the content of the file should be the title and the definition. Every term with its definition can be found in a separate p-tag and the term itself is a b-tag withing the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env/ python
import requests
import bs4
def download(url):
r = requests.get(url)
html = r.text
soup = bs4.BeautifulSoup(html, 'html.parser')
terms_definition = []
#for item in soup.find_all('p'): #beter definiƫren
items = soup.find_all("div", {"class" : "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
#print(term)
if term.text is not 'None':
#print(term.text)
#print("\n")
term_split = term.text.split()
print(term_split)
if term.text != None and len(term.text) > 1:
if '-' in term.text.split():
print(term.text)
print('\n')
if item.find('p'):
terms_definition.append(item['p'])
print(terms_definition)
return terms_definition
def create_url(start, end):
list_url = []
base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
for x in range(start, end):
list_url.append(base_url + str(x))
return list_url
def search_all_url(list_url):
for url in list_url:
download(url)
#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term
def text_files
path_write = os.path.join('data', name_term +'.txt') #'term' should be replaced by the scraped terms
with open(path_write, 'w') as f:
f.write()
#for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file
if __name__ == '__main__':
download('http://www.hogwartsishere.com/library/book/99/chapter/1')
#list_url = create_url(1, 27)
#search_all_url(list_url)
Thanks in advance!
You can iterate over all pages (1-27) to get its content, then parse each page with bs4 and then save results to files:
import requests
import bs4
import re
for i in range(1, 27):
r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
soup = bs4.BeautifulSoup(r, 'html.parser')
items = soup.find_all("div", {"class": "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
title = re.match('^(.*) -', term.text).group(1).replace('/', '-')
with open(title + '.txt', 'w', encoding='utf-8') as f:
f.write(term.text)
Output files:

Python extract and append data into data frame

I've scraped the website for my research but I couldn't find the right way to extract it into data frame. I believe that my problem is related with list objects that are between lines 36 and 38.
The print line has worked very nice that I can see the final version of data frame in the Python console.
The solution can be really easy but I couldn't figure it out. Thanks in advance for all help.
from time import sleep
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd
# Insert the hisghest page number for website
highest_number = 12
def total_page_number(url):
all_webpage_links = []
all_webpage_links.insert(0, url)
pages = [str(each_number) for each_number in range(2, highest_number)]
for page in pages:
link = ''.join(url + '&page=' + page)
all_webpage_links.append(link)
return all_webpage_links
# Use total_page_number function to create page list for website
All_page = total_page_number(
'https://www.imdb.com/search/title?countries=tr&languages=tr&locations=Turkey&count=250&view=simple')
def clean_text(text):
""" Removes white-spaces before, after, and between characters
:param text: the string to remove clean
:return: a "cleaned" string with no more than one white space between
characters
"""
return ' '.join(text.split())
# Create list objects for data
# Problem occurs in this line !!!!!!
actor_names = []
titles = []
dates = []
def get_cast_from_link(movie_link):
""" Go to the IMDb Movie page in link, and find the cast overview list.
Prints tab-separated movie_title, actor_name, and character_played to
stdout as a result. Nothing returned
:param movie_link: string of the link to IMDb movie page (http://imdb.com
...)
:return: void
"""
movie_page = requests.get(movie_link)
# Use SoupStrainer to strain the cast_list table from the movie_page
# This can save some time in bigger scraping projects
cast_strainer = SoupStrainer('table', class_='cast_list')
movie_soup = BeautifulSoup(movie_page.content, 'html.parser', parse_only=cast_strainer)
# Iterate through rows and extract the name and character
# Remember that some rows might not be a row of interest (e.g., a blank
# row for spacing the layout). Therefore, we need to use a try-except
# block to make sure we capture only the rows we want, without python
# complaining.
for row in movie_soup.find_all('tr'):
try:
actor = clean_text(row.find(itemprop='name').text)
actor_names.append(actor)
titles.append(movie_title)
dates.append(movie_date)
print('\t'.join([movie_title, actor, movie_date]))
except AttributeError:
pass
# Export data frame
# Problem occurs in this line !!!!!!
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
'Movie_Title': titles,
'Movie_Date': dates})
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', encoding='utf-8')
for each in All_page:
# Use requests.get('url') to load the page you want
web_page = requests.get(each)
# https://www.imdb.com/search/title?countries=tr&languages=tr&count=250&view=simple&page=2
# Prepare the SoupStrainer to strain just the tbody containing the list of movies
list_strainer = SoupStrainer('div', class_='lister-list')
# Parse the html content of the web page with BeautifulSoup
soup = BeautifulSoup(web_page.content, 'html.parser', parse_only=list_strainer)
# Generate a list of the "Rank & Title" column of each row and iterate
movie_list = soup.find_all('span', class_='lister-item-header')
for movie in movie_list:
movie_title = movie.a.text
movie_date = movie.find('span', class_='lister-item-year text-muted unbold').text
# get the link to the movie's own IMDb page, and jump over
link = 'http://imdb.com' + movie.a.get('href')
get_cast_from_link(link)
# remember to be nice, and sleep a while between requests!
sleep(15)

How to extract data from all urls, not just the first

This script is generating a csv with the data from only one of the urls fed into it. There are meant to be 98 sets of results, however the for loop isn't getting past the first url.
I've been working on this for 12hrs+ today, what am I missing in order get the correct results?
import requests
import re
from bs4 import BeautifulSoup
import csv
#Read csv
csvfile = open("gyms4.csv")
csvfilelist = csvfile.read()
def get_page_data(urls):
for url in urls:
r = requests.get(url.strip())
soup = BeautifulSoup(r.text, 'html.parser')
yield soup # N.B. use yield instead of return
print r.text
with open("gyms4.csv") as url_file:
for page in get_page_data(url_file):
name = page.find("span",{"class":"wlt_shortcode_TITLE"}).text
address = page.find("span",{"class":"wlt_shortcode_map_location"}).text
phoneNum = page.find("span",{"class":"wlt_shortcode_phoneNum"}).text
email = page.find("span",{"class":"wlt_shortcode_EMAIL"}).text
th = pages.find('b',text="Category")
td = th.findNext()
for link in td.findAll('a',href=True):
match = re.search(r'http://(\w+).(\w+).(\w+)', link.text)
if match:
web_address = link.text
gyms = [name,address,phoneNum,email,web_address]
gyms.append(gyms)
#Saving specific listing data to csv
with open ("xgyms.csv", "w") as file:
writer = csv.writer(file)
for row in gyms:
writer.writerow([row])
You have 3 for-loops in your code and do not specifiy which one causes problem. I assume it is the one in get_page_date() function.
You leave the looop exactly in the first run with the return assignemt. That is why you never get to the second url.
There are at least two possible solutions:
Append every parsed line of url to a list and return that list.
Move you processing code in the loops and append the parsed data to gyms in the loop.
As Alex.S said, get_page_data() returns on the first iteration, hence subsequent URLs are never accessed. Furthermore, the code that extracts data from the page needs to be executed for each page downloaded, so it needs to be in a loop too. You could turn get_page_data() into a generator and then iterate over the pages like this:
def get_page_data(urls):
for url in urls:
r = requests.get(url.strip())
soup = BeautifulSoup(r.text, 'html.parser')
yield soup # N.B. use yield instead of return
with open("gyms4.csv") as url_file:
for page in get_page_data(url_file):
name = page.find("span",{"class":"wlt_shortcode_TITLE"}).text
address = page.find("span",{"class":"wlt_shortcode_map_location"}).text
phoneNum = page.find("span",{"class":"wlt_shortcode_phoneNum"}).text
email = page.find("span",{"class":"wlt_shortcode_EMAIL"}).text
# etc. etc.
You can write the data to the CSV file as each page is downloaded and processed, or you can accumulate the data into a list and write it in one for with csv.writer.writerows().
Also you should pass the URL list to get_page_data() rather than accessing it from a global variable.

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting tags and ending tags. If anyone out there would like to help an amateur python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information that can be easily copied and pasted into an excel document for subsequent data analysis!
For example, lets say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the outputs, I think you can store them in a csv file.
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use immutable objects to store data... I'm sure this this the wrong way of doing it. But its worked with several projects for me in the past.
import urllib2
from HTMLParser import HTMLParser
import csv
position = []
results = [""]
class hp(HTMLParser):
def handle_starttag(self, tag, attrs):
if tag == 'span' and ('class', 'watch-view-count ') in attrs:
position.append('bingo')
def handle_endtag(self, tag):
if tag == 'span' and 'bingo' in position:
position.remove('bingo')
def handle_data(self, data):
if 'bingo' in position:
results[0] += " " + data.strip() + " "
my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
response = urllib2.urlopen(url)
page = str(response.read())
parser = hp()
parser.feed(page)
data.append(results[0])
# reinitialize immutiable objects
position = []
results = [""]
index = 0
with open('/path/to/test.csv', 'wb') as f:
writer = csv.writer(f)
header = ['url', 'output']
writer.writerow(header)
for d in data:
row = [my_pages[index], data[index]]
writer.writerow(row)
index += 1
Then just open /path/to/test.csv in Excel

Categories