Python: give column names and write values in separate columns as a table

My code:
from lxml import html
import requests
import csv
# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# example site
page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

# This will create a list of services:
tname = tree.xpath('//*[@id="colLeft"]//table//tr/td[1]/text()')
tvalue = tree.xpath('//table//tr/td[2]/text()')
print tname
print tvalue

print 'Input the csv file'
csvfile = raw_input("> ")

res = tname, tvalue
# Assuming res is a list of lists
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(res)
My output in the csv:
Reynolds American Inc. Consolidated-Tomoka Land Co. British American Tobacco
8.30% 7.50% 7.10% 6.60% 6.40% 5.90% 5.30% 4.80% 4.70% 4.10%
Required output: the same as on the website, with column names.
Ref: http://www.wintergreenfund.com/reports/top-ten/
Also, unicode is not working. I need help with this.
My new code:
from lxml import html
import requests
import csv

page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

csvrows = []
for rows in tree.xpath('//*[@id="colLeft"]//table//tr'):
    csvrows.append([rows.xpath('./td[1]/text()'), rows.xpath('./td[2]/text()')])
print csvrows

print 'Input the csv file'
csvfile = raw_input("> ")

with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(['Name', 'Value'])  # substitute as appropriate.
    writer.writerows(csvrows)
I am getting values wrapped in ['...'] and also empty [] entries.

First, if you want to combine two lists element by element, you should use zip(). Currently you are creating a tuple of two lists in the line res = tname,tvalue and then writing it as-is to the csv.
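For illustration, a minimal sketch of that zip() fix applied to your original lists (assuming tname and tvalue line up one-to-one):

# pair each name with its value; zip() stops at the shorter list
res = zip(tname, tvalue)  # [(name1, value1), (name2, value2), ...]
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(res)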
Second, you should use one xpath to get each row of the table, and then a relative xpath to get each required td element from that row, rather than running the two separate document-wide xpaths you have now.
Example -
from lxml import html
import requests
import csv

page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

csvrows = []
for rows in tree.xpath('//*[@id="colLeft"]//table//tr'):
    row1text = rows.xpath('./td[1]/text()')
    row2text = rows.xpath('./td[2]/text()')
    if row1text and row2text:
        csvrows.append([row1text[0], row2text[0]])
print(csvrows)

print('Input the csv file')
csvfile = input("> ")

with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(['Name', 'Value'])  # substitute as appropriate.
    writer.writerows(csvrows)
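On the unicode issue from the question: a minimal sketch, assuming Python 3, is to drop the reload(sys)/setdefaultencoding hack and declare the encoding when opening the output file:

# Python 3: declare the encoding (and newline='') when opening the csv file
with open(csvfile, "w", encoding="utf-8", newline="") as output:
    writer = csv.writer(output)
    writer.writerow(['Name', 'Value'])
    writer.writerows(csvrows)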

Related

How to make sure the data is matching while web-scraping to CSV?

I'm extracting data from the DESWATER website, and these data are then saved to CSV. To make a small example of the issue: I have these two authors, one with a full text file and one without. Hence, the file is saved to the wrong author.
So the CSV output looks like this:
Authors | File
First Author | Second File
Second Author | Third File
But I want the output like this:
Authors | File
First Author | 'No File'
Second Author | Second File
Third Author | Third File
Here is a small test code:
from bs4 import BeautifulSoup
import requests
import time
import csv

list_of_authors = []
list_of_full_file = []

r = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')
# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')

# 'Author'
s = soup.find('td', class_='testo_normale')
authors = s.find_all('i')
for author in authors:
    list_of_authors.append(author.text.strip())
    time.sleep(1)

# 'FULL TEXT'
# find all the anchor tags with "href"
n = 1
for link in soup.find_all('a', class_='testo_normale_rosso'):
    if "fulltext.php?abst=" in link.get('href'):
        # TO ADD
        baseurl = 'https://www.deswater.com/'
        Full_links = baseurl + link.attrs['href'].replace('\n', '')
        list_of_full_file.append(f'file {n}')
        n += 1
    time.sleep(1)

def Save_csv():
    row_head = ['Author', 'File Name']
    Data = []
    for author, file in zip(list_of_authors, list_of_full_file):
        Data.append(author)
        Data.append(file)
    rows = [Data[i:i + 2] for i in range(0, len(Data), 2)]
    with open('data.csv', 'w', encoding='utf_8_sig', newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(row_head)
        csvwriter.writerows(rows)

Save_csv()
This code will ultimately extract data from 279 pages, so I need it to automatically detect that there is no Full Text for a given author, so that I can append 'No File' instead.
See the reference of the correct matching in the website here.
The first author doesn't have a full text file.
Any Ideas?
Try changing your strategy for selecting the elements, and avoid multiple lists if you cannot ensure they have the same length.
Use CSS selectors here to select all <hr> elements, which serve as the base for all the other selections via find_previous():
for e in soup.select('.testo_normale hr'):
    data.append({
        'author': e.find_previous('i').text,
        'file': 'https://www.deswater.com/' + e.find_previous('a').get('href') if 'fulltext' in e.find_previous('a').get('href') else 'no url'
    })
Example
from bs4 import BeautifulSoup
import requests
import csv

soup = BeautifulSoup(requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009').content)

with open('data.csv', 'w', encoding='utf-8', newline='') as f:
    data = []
    for e in soup.select('.testo_normale hr'):
        data.append({
            'author': e.find_previous('i').text,
            'file': 'https://www.deswater.com/' + e.find_previous('a').get('href') if 'fulltext' in e.find_previous('a').get('href') else 'no url'
        })
    dict_writer = csv.DictWriter(f, data[0].keys())
    dict_writer.writeheader()
    dict_writer.writerows(data)
Output
author,file
Miriam Balaban,no url
W. Richard Bowen,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzEucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kW.k#13#kRichardk#13#kBowenk#1#kk#4#kik#2#kk#1#kbrk#2#kWaterk#13#kengineeringk#13#kfork#13#kthek#13#kpromotionk#13#kofk#13#kpeacek#1#kbrk#2#k1k#15#k2009k#16#k1k#35#k6k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k1.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NC9URFdUX0FfMTA1MTI4NjRfTy5wZGY=&type=1
Steven J. Duranceau,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzcucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kStevenk#13#kJ.k#13#kDuranceauk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#kpermeatek#13#ktransientk#13#kresponsek#13#ktok#13#kperturbationsk#13#kfromk#13#ksteadyk#13#kstatek#13#kink#13#kak#13#knanofiltrationk#13#kprocessk#1#kbrk#2#k1k#15#k2009k#16#k7k#35#k16k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k7.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NS9URFdUX0FfMTA1MTI4NjVfTy5wZGY=&type=1
"Dmitry Lisitsin, David Hasson, Raphael Semiat",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzE3LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kDmitryk#13#kLisitsink#6#kk#13#kDavidk#13#kHassonk#6#kk#13#kRaphaelk#13#kSemiatk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#keffectk#13#kofk#13#kantik#35#kscalantk#13#konk#13#kCaCO3k#13#kprecipitationk#13#kink#13#kcontinuousk#13#kflowk#1#kbrk#2#k1k#15#k2009k#16#k17k#35#k24k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k17.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ni9URFdUX0FfMTA1MTI4NjZfTy5wZGY=&type=1
"M.A. Darwish, Fatima M. Al-Awadhi, A. Akbar, A. Darwish",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzI1LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kM.A.k#13#kDarwishk#6#kk#13#kFatimak#13#kM.k#13#kAlk#35#kAwadhik#6#kk#13#kA.k#13#kAkbark#6#kk#13#kA.k#13#kDarwishk#1#kk#4#kik#2#kk#1#kbrk#2#kAlternativek#13#kprimaryk#13#kenergyk#13#kfork#13#kpowerk#13#kdesaltingk#13#kplantsk#13#kink#13#kKuwaitk#32#kk#13#kthek#13#knucleark#13#koptionk#13#kIk#1#kbrk#2#k1k#15#k2009k#16#k25k#35#k41k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k25.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ny9URFdUX0FfMTA1MTI4NjdfTy5wZGY=&type=1
...

Using Python's parse with criteria

First of all, I have to say that I have very little experience with any sort of coding, so I don't completely know what I'm after here, but I'm trying my best!
I've been writing code that takes the HTML of a certain website and then gives me a .CSV file of the named elements (you can see these in the inspect panel of the website).
So my question is: how can I use criteria with my current code so it only returns words containing, for example, the letter g?
I'm happy to elaborate!
Thank you already!
import urllib.request
from bs4 import BeautifulSoup
import csv

url = 'https://kouluruoka.fi/menu/kouvola_koulujenruokalista'
request = urllib.request.Request(url)
content = urllib.request.urlopen(request)
parse = BeautifulSoup(content, 'html.parser')

# These texts get words in <h2> and <span> named elements
text1 = parse.find_all('h2')
text2 = parse.find_all('span')

# This code uses the texts above to create the .CSV file
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for col1, col2 in zip(text1, text2):
        writer.writerow([col1.get_text().strip(), col2.get_text().strip()])
You can check whether elements contain some string/letter this way:
h2_elements = parse.find_all('h2')
span_elements = parse.find_all('span')

# This code uses the texts above to create the .CSV file
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for h2_element, span_element in zip(h2_elements, span_elements):
        h2_element_str = h2_element.get_text().strip()
        span_element_str = span_element.get_text().strip()
        if 'a' in h2_element_str and 'a' in span_element_str:
            writer.writerow([h2_element_str, span_element_str])
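To match the letter g from the question instead of a, the same check can be adjusted, for example (a sketch under the same assumptions, with a case-insensitive comparison):

# keep only rows where both cells contain the letter 'g' (case-insensitive)
if 'g' in h2_element_str.lower() and 'g' in span_element_str.lower():
    writer.writerow([h2_element_str, span_element_str])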

How to read the entire urls from the first column in a csv file

I am trying to read the urls from the first column in a csv file. The csv file contains 6051 urls in total that I want to read. To do so, I tried the following code:
urls = []
with open("C:/Users/hyoungm/Downloads/urls.csv") as csvfile:
    blogurl = csv.reader(csvfile)
    for row in blogurl:
        row = row[0]
        print(row)
len(row)
However, the number of urls that are shown is only 65. I have no idea why the total number of urls appears differently from the csv file.
Can anybody help me with figuring out how to read all urls (6051 in total) from the csv file?
To read all the urls from the csv file, I also tried several other snippets that resulted in the same number of urls (i.e., 65) or in failure, such as:
1)
openfile = open("C:/Users/hyoungm/Downloads/urls.csv")
r = csv.reader(openfile)
for i in r:
    # the urls are in the first column ... 0 refers to the first column
    blogurls = i[0]
    print(blogurls)
len(blogurls)
2)
urls = pd.read_csv("C:/Users/hyoungm/Downloads/urls.csv")
with closing(requests.get(urls, stream=True)) as r:
    reader = csv.reader(r.iter_lines(), delimiter=',', quotechar='""')
    for row in reader:
        print(row)
len(row)
3)
with open("C:/Users/hyoungm/Downloads/urls.csv") as csvfile:
    lines = csv.reader(csvfile)
    for i, line in enumerate(lines):
        if i == 0:
            for line in csvfile:
                print(line[1:])
len(line)
4)
blogurls = []
with open("C:/Users/hyoungm/Downloads/urls.csv") as csvfile:
    r = csv.reader(csvfile)
    for i in r:
        blogurl = i[0]
        r = requests.get(blogurl)
        blogurls.append(blogurl)
for url in blogurls:
    page = urlopen(url[0]).read()
    soup = BeautifulSoup(page, "html.parser")
len(blogurls)
I expect an output of 6051 urls, as originally collected in the csv file, instead of 65 urls.
After reading all the urls, I am going to scrape the textual data from each url. I am supposed to get the following textual data using all 6051 urls. Please click the following link for the image:
the codes and the outcomes based on 65 urls so far
The following two approaches work for me:
import requests
r = requests.get('https://raw.githubusercontent.com/GemmyMoon/MultipleUrls/master/urls.csv')
urls = r.text.splitlines()
print(len(urls)) # Returns 6051
and
import csv
import requests
from io import StringIO
r = requests.get('https://raw.githubusercontent.com/GemmyMoon/MultipleUrls/master/urls.csv')
reader = csv.reader(StringIO(r.text))
urls = [line[0] for line in reader]
print(len(urls)) # Returns 6051
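From there, a minimal sketch of the follow-up step mentioned in the question (scraping the text from each url), assuming the pages are plain HTML and that requests and BeautifulSoup are acceptable:

import requests
from bs4 import BeautifulSoup

texts = []
for url in urls:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # collect the visible text of each page
    texts.append(soup.get_text(separator=" ", strip=True))
print(len(texts))  # should match len(urls)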

Python - assign print output to csv

I am working on a project to scrape multiple Twitter URLs and write their follower counts to a csv:
import requests
from bs4 import BeautifulSoup

username = ['LazadaPH', 'ZALORAPH', 'ShopeePH', 'eBayPhilippines', 'beauty_MNL']

for user in username:
    url = 'https://www.twitter.com/' + user
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    f = soup.find('li', class_="ProfileNav-item--followers")
    title = f.find('a')['title']
    num_followers = int(title.split(' ')[0].replace(',', ''))
    print(user, num_followers)
The output looks as follows:
LazadaPH 52841
ZALORAPH 29786
ShopeePH 7004
eBayPhilippines 874
beauty_MNL 2469
Since I'm quite new to Python (and I hope I'm not asking a redundant question): can someone point me to sources and tutorials on how to write this printed output to a csv, essentially splitting it into two columns (column 1 the website string, column 2 the follower count)?
Any suggestions?
Thanks a bunch!
You can use the csv module.
Ex:
import csv
import requests
from bs4 import BeautifulSoup

with open('out.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')  # ----> COMMA separated
    for user in username:
        url = 'https://www.twitter.com/' + user
        resp = requests.get(url)  # separate name so the csv writer is not overwritten
        soup = BeautifulSoup(resp.content, 'lxml')
        f = soup.find('li', class_="ProfileNav-item--followers")
        title = f.find('a')['title']
        num_followers = int(title.split(' ')[0].replace(',', ''))
        writer.writerow([user, num_followers])  # ----> Adding rows
Make your print statement like this:
print(user,';',num_followers)
So that it prints ';' as separator for values. Then pipe the output to a file:
python yourscript.py > yourcsv.csv

How to save the string, one word per column in Python?

I'm scraping the names of massage therapists along with their addresses from a directory. The addresses are all being saved into the CSV in one column for the whole string, but the title/name of each therapist is being saved one word per column over 2 or 3 columns.
What do I need to do so that the extracted string is saved in one column, the way the addresses are? (The first two lines below are example HTML from the page; the code after that is the extract from the script targeting this element.)
<span class="name">
<img src="/images/famt-placeholder-sm.jpg" class="thumb" alt="Tiffani D Abraham"> Tiffani D Abraham</span>
import mechanize
from lxml import html
import csv
import io
from time import sleep

def save_products(products, writer):
    for product in products:
        for price in product['prices']:
            writer.writerow([product["title"].encode('utf-8')])
            writer.writerow([price["contact"].encode('utf-8')])
            writer.writerow([price["services"].encode('utf-8')])

f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

links = ["https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=2&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=3&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=4&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=5&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=6&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=7&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=8&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=9&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=10&PageSize=10" ]

br = mechanize.Browser()
for link in links:
    print(link)
    r = br.open(link)
    content = r.read()
    products = []
    tree = html.fromstring(content)
    product_nodes = tree.xpath('//ul[@class="famt-results"]/li')
    for product_node in product_nodes:
        product = {}
        price_nodes = product_node.xpath('.//a')
        product['prices'] = []
        for price_node in price_nodes:
            price = {}
            try:
                product['title'] = product_node.xpath('.//span[1]/text()')[0]
            except:
                product['title'] = ""
            try:
                price['services'] = price_node.xpath('./span[2]/text()')[0]
            except:
                price['services'] = ""
            try:
                price['contact'] = price_node.xpath('./span[3]/text()')[0]
            except:
                price['contact'] = ""
            product['prices'].append(price)
        products.append(product)
    save_products(products, writer)
f_out.close()
I'm not positive if this solves the issue you were having, but either way there are a few improvements and modifications you might be interested in.
For example, since each link varies by a page index you can loop through the links easily rather than copying all 50 down to a list. Each therapist per page also has their own index, so you can also loop through the xpaths for each therapist's information.
# import modules
import mechanize
from lxml import html
import csv
import io

# open browser
br = mechanize.Browser()

# create file headers
titles = ["NAME"]
services = ["TECHNIQUE(S)"]
contacts = ["CONTACT INFO"]

# loop through all the webpages for therapist data
for link_index in range(1, 50):
    link = "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=" + str(link_index) + "&PageSize=10"
    r = br.open(link)
    page = r.read()
    tree = html.fromstring(page)
    # loop through therapist data for each therapist per page
    for therapist_index in range(1, 10):
        # store names
        title = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[1]/text()')
        titles.append(" ".join(title))
        # store techniques and convert to unicode
        service = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[2]/text()')
        try:
            services.append(service[0].encode("utf-8"))
        except:
            services.append(" ")
        # store contact info and convert to unicode
        contact = tree.xpath('//*[#id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[3]/text()'.replace('#', '@'))
        try:
            contacts.append(contact[0].encode("utf-8"))
        except:
            contacts.append(" ")

# open file to write to
f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

# get rows in correct format
rows = zip(titles, services, contacts)

# write csv line by line
for row in rows:
    writer.writerow(row)
f_out.close()
The script loops through all of the links for the provided webpage and seems to scrape all relevant information for each therapist where it is provided. Finally, it writes all the data to a csv, with the data stored under the respective columns 'Name', 'Technique(s)', and 'Contact Info', if this is what you were originally struggling with.
Hope this helps!
