Writerow does not print some variables to the csv file - python

The following code fails to write the name and address variables to a CSV file. When I test it with numbers, words, or the "Write" variable, those are recorded in the CSV, but "WriteName" and "WriteAddress" are not*. (Using the original sources for these variables also leaves blanks.)
import requests, sys, pyperclip, bs4, csv

StationList = open('CTA Station Addresses.csv', 'w', newline='')
StationWrite = csv.writer(StationList)

for i in range(149):
    id = str(i)
    res = requests.get('http://www.transitchicago.com/travel_information/station.aspx?StopId=' + id)
    res.raise_for_status()
    Station = bs4.BeautifulSoup(res.text)
    Name = Station.select('.rtehdng')
    Address = Station.select('#ctl07_divAddress')
    Write = 0
    if Name == []:
        print('missing name')
        Write = 1
    else:
        #print(Name[0].getText())
        WriteName = Name[0].getText()
        pass
    if Address == []:
        print('missing address')
        Write = 1
    else:
        #print(Address[0].getText())
        WriteAddress = Address[0].getText()
        pass
    if Write == 0:
        StationWrite.writerow([Write, WriteName, WriteAddress])
    Write = 0

StationList.close()
*(I can do writerow([3, Write, WriteName]) and the CSV row will be "3, 0, ".)

I couldn't reproduce your error, but the data you get has embedded newlines and spaces, which can make the CSV look odd. I've cleaned up the script and scrubbed the data before writing the CSV, ending up with station,address entries. I didn't see a need to write Write: it was always 0 in your script, and it doesn't exist in mine at all, since I use exception handling instead.
import requests, sys, pyperclip, bs4, csv

with open('CTA Station Addresses.csv', 'w', newline='') as StationList:
    StationWrite = csv.writer(StationList)
    for i in range(149):
        _id = str(i)
        res = requests.get('http://www.transitchicago.com/travel_information/station.aspx?StopId=' + _id)
        res.raise_for_status()
        Station = bs4.BeautifulSoup(res.text, 'lxml')
        try:
            name = Station.select('.rtehdng')[0].getText().strip()
            address = Station.select('#ctl07_divAddress')[0].getText().splitlines()[-1].strip()
        except IndexError as e:
            print("No data for station", _id)
            continue
        if not name or not address:
            print('Empty elements for station', _id)
            continue
        print(repr(name), repr(address))
        StationWrite.writerow([name, address])
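Note that splitlines()[-1] assumes the address is the last line of the div's text; if the page puts anything after the street address, that index would need adjusting.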

Related

Splitting a big HTML file into multiple smaller files

I extracted a full Discord DM as an HTML file.
It's too big to open as-is, so I wanted to split it into multiple files. I found another post here that had a solution, but I can't figure out what I'm doing wrong: the script opens for about a second and then just closes with no results.
Here is the code I'm using.
from __future__ import print_function
from lxml import etree, html
from io import StringIO
from pathlib import Path

parser = html.HTMLParser()
header = "<html><body>\n"
footer = "</body></html>\n"
i = 1
fi = 1
messagesPerFile = 3
file = "DMPim.html"
buffer = ""

try:
    tree = html.parse(StringIO(Path(file).read_text()), parser)
    try:
        # target all <div class="chatlog__message-group"> elements and subelements
        for element in tree.xpath('//div[@class="chatlog__message-group"]'):
            buffer += etree.tostring(element, pretty_print=True).decode("utf-8")
            if i % messagesPerFile == 0 and i > 0:
                f = open("chat" + str(fi) + ".html", "w+")
                f.write(header + buffer + footer)
                f.close()
                fi += 1
                buffer = ""
            i += 1
        # if remaining elements are still in the buffer, write them out
        if buffer != "":
            f = open("chat" + str(fi) + ".html", "w+")
            f.write(header + buffer + footer)
            f.close()
    except etree.XPathEvalError as details:
        print('ERROR: XPath expression', details.error_log)
except etree.XMLSyntaxError as details:
    print('ERROR: parser', details.error_log)
I'm pretty new in all this and wanted this as sort of start up project, so excuse me if I'm asking obvious things.
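A window that flashes open and closes usually means the script raised an exception or finished before the output could be read. A minimal sketch of one way to keep the console open long enough to read a traceback (main() here is a hypothetical wrapper around the splitting logic above; traceback is standard library):

import traceback

def main():
    # ... the splitting logic from the question goes here ...
    pass

try:
    main()
except Exception:
    traceback.print_exc()              # show the full traceback instead of dying silently
finally:
    input("Press Enter to exit...")    # keep the console window open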

Export all table data from PDF to Excel using Amazon Textract

Looking to extract PDF data to Excel/CSV using Amazon Textract. How can we feed it an input PDF from a local folder?
The PDF has multiple tables; we need to extract all the tables from their respective pages and export the data to CSV/Excel files, which can be used for further analysis.
Below is a piece of code received from AWS, but I could not understand how the input PDF file is taken up into the script.
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint

def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows

def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text

def get_table_csv_results(file_name):
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')
    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks = response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index + 1)
        csv += '\n\n'
    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)
    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)
    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
    csv += '\n\n\n'
    return csv

def main(file_name):
    table_csv = get_table_csv_results(file_name)
    output_file = 'output.csv'
    # replace content
    with open(output_file, "wt") as fout:
        fout.write(table_csv)
    # show the results
    print('CSV OUTPUT FILE: ', output_file)

if __name__ == "__main__":
    file_name = sys.argv[1]
    main(file_name)
First you must set up the necessary environment in AWS: install awscli and configure it with your AWS credentials. Having that, you only need to install the corresponding libraries and change the last lines of the code:
if __name__ == "__main__":
    file_name = "name_image.png"
    main(file_name)
I recommend you to read this publication, to set up your aws environment:
https://medium.com/@victorjatoba10/extract-tables-and-forms-from-pdf-using-amazon-aws-textract-827c6e866453
You can read the file yourself and pass the Bytes to Textract
import os
import boto3

client_Textract = boto3.client('textract')  # the Textract client the loop below uses

for filename in os.listdir('input'):
    if filename.endswith("jpg"):
        with open('input/' + filename, 'rb') as img_file:
            img_bytes = img_file.read()
            response = client_Textract.analyze_document(Document={'Bytes': img_bytes}, FeatureTypes=["TABLES"])

Write output data to csv

I'm writing a short piece of code in Python to check the status code of a list of URLs. The steps are:
1. Read the URLs from a CSV file.
2. Check the request status code.
3. Write the status code into the CSV next to the checked URL.
I've managed the first two steps, but I'm stuck on writing the output of the requests into the same CSV, next to the URLs. Please help.
import urllib.request
import urllib.error
from multiprocessing import Pool

file = open('innovators.csv', 'r', encoding="ISO-8859-1")
urls = file.readlines()

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        print('200' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=1)
    result = p.map(checkurl, urls)
    with open('innovators.csv', 'w') as f:
        for line in file:
            url = ''.join(line)
            checkurl(urls + "," + checkurl)
The .readlines() operation leaves the file object at the end of the file. When you attempt to loop through the lines of file again without first rewinding it (file.seek(0)) or closing and reopening it, there are no lines remaining. It is always recommended to use the with open(...) as file construct to ensure the file is closed when the operation is finished.
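To see the rewind behavior in isolation, a minimal sketch (file name taken from the question):

with open('innovators.csv', 'r', encoding="ISO-8859-1") as file:
    urls = file.readlines()        # the file pointer is now at end-of-file
    for line in file:              # nothing left to read; this body never runs
        print('never reached')
    file.seek(0)                   # rewind to the beginning
    first_line = file.readline()   # now reads the first line again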
Additionally, there appears to be an error in your input to checkurl. You have added a list (urls) to a string (",") to a function (checkurl).
You probably meant for this section to read
with open('innovators.csv', 'w') as f:
    for line in urls:
        url = ''.join(line.replace('\n', ''))  # readlines leaves a linefeed character at the end of each line
        f.write(url + "," + checkurl(url) + "\n")  # newline so each URL gets its own row
The checkurl function should return what you are intending to place into the csv file. You are simply printing to standard output (screen). Thus, replace your checkurl with
def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
        ret = '0'
    except urllib.error.HTTPError as e:
        ret = 'HTTPError: {}'.format(e.code)
    except urllib.error.URLError as e:
        ret = 'URLError: {}'.format(e.reason)
    else:
        ret = '200'
    return ret
or something equivalent to your needs.
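Putting the pieces together with the Pool from the question, a sketch (assuming innovators.csv holds one URL per line; Pool.map returns results in input order, so each status lines up with its URL):

import urllib.request
import urllib.error
from multiprocessing import Pool

def checkurl(url):
    try:
        urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:      # HTTPError is a subclass of URLError, so it must come first
        return 'HTTPError: {}'.format(e.code)
    except urllib.error.URLError as e:
        return 'URLError: {}'.format(e.reason)
    return '200'

if __name__ == "__main__":
    with open('innovators.csv', 'r', encoding="ISO-8859-1") as f:
        urls = [line.strip() for line in f if line.strip()]
    with Pool(processes=4) as p:
        statuses = p.map(checkurl, urls)  # same order as urls
    with open('innovators.csv', 'w') as f:
        for url, status in zip(urls, statuses):
            f.write(url + "," + status + "\n")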
Save the status in a dict and convert it to a DataFrame, then simply send it to a CSV file. str(code.getcode()) returns '200' if the URL is reachable; otherwise urlopen raises an exception, for which I assign the status '000'. So your CSV file will contain url,200 if the URL is reachable and url,000 if it is not.
import urllib.request
import pandas as pd

status_dict = {}
for line in lines:  # lines: the URLs read from the input file
    try:
        code = urllib.request.urlopen(line)
        status = str(code.getcode())
        status_dict[line] = status
    except:
        status = "000"
        status_dict[line] = status

# a dict of scalar values must be turned into rows first; note it is
# pd.DataFrame, not pd.Dataframe, which raises AttributeError
df = pd.DataFrame(list(status_dict.items()), columns=['url', 'status'])
df.to_csv('filename.csv', index=False)

Extract tags from one column in CSV using Python [duplicate]

This question already has answers here:
Parsing out single column from csv into text file using python
(3 answers)
Closed 8 years ago.
I am trying to extract tagged entities from a csv file using python. This file contains tagged entities in multiple columns of the csv file. I only want python to process one specific column. Can anybody show me how to do this?
This is my code:
from bs4 import BeautifulSoup
import csv

input_name = "file.csv"  # File names for input and output
output_name = "entities.csv"

def incrementEntity(entity_string, dictionary):
    try:
        dictionary[entity_string] += 1
    except KeyError:
        dictionary[entity_string] = 1

def outputResults(dictionary, entity_type, f):
    for i in sorted(dictionary, key=dictionary.get, reverse=True):
        print i, '\t', entity_type, '\t', dictionary[i]
        f.writerow([i, entity_type, dictionary[i]])

try:
    f = open(input_name, 'r')
    soup = BeautifulSoup(f)
    f.close()
except IOError, message:
    print message
    raise ValueError("Input file could not be opened")

locations = {}
people = {}
orgs = {}

for i in soup.find_all():
    entity_name = i.get_text()
    entity_type = i.name
    if (entity_type == 'i-loc' or entity_type == 'b-loc'):
        incrementEntity(entity_name, locations)
    elif (entity_type == 'b-org' or entity_type == 'i-org'):
        incrementEntity(entity_name, orgs)
    elif (entity_type == 'b-per' or entity_type == 'i-per'):
        incrementEntity(entity_name, people)
    else:
        continue

output_file = open(output_name, 'w')
f = csv.writer(output_file)
print "Entity\t\tType\t\tCount"
print "------\t\t----\t\t-----"
f.writerow(["Entity", "Type", "Count"])
outputResults(locations, 'location', f)
outputResults(people, 'person', f)
outputResults(orgs, 'organization', f)
output_file.close()
By definition, a CSV is a file in which data is separated by commas. So all you have to do is use the .split() method of the string you are dealing with.
Example:
csvline = 'Joe,25,M'
age = csvline.split(',')[1]
I don't know exactly what kind of data you are trying to process, but since you are trying to use BeautifulSoup I will assume your CSV file contains plain HTML-like data in some of its columns AND that you want to join the data of all those columns to process it with BeautifulSoup. That being the case you could try something like:
f = open(input_name, 'r')
htmlstring = '\n'.join([line.split(',')[1] for line in f])
soup = BeautifulSoup(htmlstring)
f.close()
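Note that a bare .split(',') breaks on quoted fields that themselves contain commas; the standard csv module handles those correctly. The same column extraction with csv.reader (column index 1 kept from the example above) would look something like:

import csv
from bs4 import BeautifulSoup

with open(input_name, 'r') as f:
    reader = csv.reader(f)
    # column 1 from every row, with quoted fields handled properly
    htmlstring = '\n'.join(row[1] for row in reader)
soup = BeautifulSoup(htmlstring)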

Python - Web Scraping - BeautifulSoup & CSV

I am hoping to extract the change in cost of living between one city and many other cities. I plan to list the cities I would like to compare in a CSV file and use this list to create the web links that take me to the pages with the information I am looking for.
Here is the link to an example: http://www.expatistan.com/cost-of-living/comparison/phoenix/new-york-city
Unfortunately I am running into several challenges. Any assistance with the following is greatly appreciated!
1. The output only shows the percentage, with no indication of whether it is more expensive or cheaper. For the example listed above, my output based on the current code shows 48%, 129%, 63%, 43%, 42%, and 42%. I tried to correct for this by adding an if-statement to prepend a '+' sign if it is more expensive, or a '-' sign if it is cheaper. However, this if-statement is not functioning correctly.
2. When I write the data to a CSV file, each of the percentages is written to a new row. I can't seem to figure out how to write them as a list on one line.
3. (Related to item 2) When I write the data to a CSV file for the example listed above, it comes out in the format shown below. How can I correct this and have the data written in the preferred format (also without the percentage sign)?
CURRENT CSV FORMAT (Note: 'if-statement' not functioning correctly):
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,1,2,9,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,6,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
PREFERRED CSV FORMAT:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city, 48,129,63,43,42,42
Here is my current code:
import requests
import csv
from bs4 import BeautifulSoup

#Read text file
Textfile = open("City.txt")
Textfilelist = Textfile.read()
Textfilelistsplit = Textfilelist.split("\n")

HomeCity = 'Phoenix'

i = 0
while i < len(Textfilelistsplit):
    url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
    page = requests.get(url).text
    soup_expatistan = BeautifulSoup(page)

    #Prepare CSV writer.
    WriteResultsFile = csv.writer(open("Expatistan.csv","w"))
    WriteResultsFile.writerow(["City","Food","Housing","Clothes","Transportation","Personal Care", "Entertainment"])

    expatistan_table = soup_expatistan.find("table",class_="comparison")
    expatistan_titles = expatistan_table.find_all("tr",class_="expandable")
    for expatistan_title in expatistan_titles:
        percent_difference = expatistan_title.find("th",class_="percent")
        percent_difference_title = percent_difference.span['class']
        if percent_difference_title == "expensiver":
            WriteResultsFile.writerow(Textfilelistsplit[i] + '+' + percent_difference.span.string)
        else:
            WriteResultsFile.writerow(Textfilelistsplit[i] + '-' + percent_difference.span.string)
    i += 1
Answers:
Question 1: the class of the span is a list; you need to check whether "expensiver" is in that list. In other words, replace:
if percent_difference_title == "expensiver"
with:
if "expensiver" in percent_difference.span['class']
Questions 2 and 3: you need to pass a list of column values to writerow(), not a string. And, since you want only one record per city, call writerow() outside of the loop (over the trs).
Other issues:
- open the csv file for writing before the loop
- use with context managers while working with files
- try to follow the PEP8 style guide
Here's the code with modifications:
import requests
import csv
from bs4 import BeautifulSoup

BASE_URL = 'http://www.expatistan.com/cost-of-living/comparison/{home_city}/{city}'
home_city = 'Phoenix'

with open('City.txt') as input_file:
    with open("Expatistan.csv", "w") as output_file:
        writer = csv.writer(output_file)
        writer.writerow(["City", "Food", "Housing", "Clothes", "Transportation", "Personal Care", "Entertainment"])
        for line in input_file:
            city = line.strip()
            url = BASE_URL.format(home_city=home_city, city=city)
            soup = BeautifulSoup(requests.get(url).text)
            table = soup.find("table", class_="comparison")
            differences = []
            for title in table.find_all("tr", class_="expandable"):
                percent_difference = title.find("th", class_="percent")
                if "expensiver" in percent_difference.span['class']:
                    differences.append('+' + percent_difference.span.string)
                else:
                    differences.append('-' + percent_difference.span.string)
            writer.writerow([city] + differences)
For a City.txt containing just one line, new-york-city, it produces an Expatistan.csv with the following content:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city,+48%,+129%,+63%,+43%,+42%,+42%
Make sure you understand what changes I have made. Let me know if you need further help.
csv.writer.writerow() takes a sequence and makes each element a column; normally you'd give it a list of column values, but you are passing in strings, so each individual character becomes its own column.
Just build a list, then write it to the CSV file.
First, open the CSV file once, not for every separate city; you are clearing out the file every time you open it.
import requests
import csv
from bs4 import BeautifulSoup

HomeCity = 'Phoenix'

with open("City.txt") as cities, open("Expatistan.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["City", "Food", "Housing", "Clothes",
                     "Transportation", "Personal Care", "Entertainment"])

    for line in cities:
        city = line.strip()
        url = "http://www.expatistan.com/cost-of-living/comparison/{}/{}".format(
            HomeCity, city)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, from_encoding=resp.encoding)
        titles = soup.select("table.comparison tr.expandable")
        row = [city]
        for title in titles:
            percent_difference = title.find("th", class_="percent")
            changeclass = percent_difference.span['class']
            change = percent_difference.span.string
            if "expensiver" in changeclass:
                change = '+' + change
            else:
                change = '-' + change
            row.append(change)
        writer.writerow(row)
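(A side note: the "wb" mode above is the Python 2 idiom for csv output; on Python 3 you would open the file in text mode with newline='' instead, e.g. open("Expatistan.csv", "w", newline='').)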
So, first of all, one passes the writerow method an iterable, and each object in that iterable gets written with commas separating them. So if you give it a string, then each character gets separated:
WriteResultsFile.writerow('hello there')
writes
h,e,l,l,o, ,t,h,e,r,e
But
WriteResultsFile.writerow(['hello', 'there'])
writes
hello,there
That's why you are getting results like
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
The rest of your problems are errors in your webscraping. First of all, when I scrape the site, searching for tables with CSS class "comparison" gives me None. So I had to use
expatistan_table = soup_expatistan.find("table","comparison")
Now, the reason your "if statement" is broken is that
percent_difference.span['class']
returns a list. If we modify that to
percent_difference.span['class'][0]
things will work the way you expect.
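A quick illustration of why the comparison fails (BeautifulSoup returns class as a list, because class is a multi-valued HTML attribute):

from bs4 import BeautifulSoup

snippet = '<th class="percent"><span class="expensiver">48%</span></th>'
soup = BeautifulSoup(snippet, 'html.parser')
print(soup.span['class'])     # ['expensiver'] -- a list, never equal to the string "expensiver"
print(soup.span['class'][0])  # 'expensiver'  -- this string comparison works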
Now, your real issue is that inside the innermost loop you are finding the % change in price for the individual items. You want these as items in your row of price differences, not as individual rows. So, I declare an empty list, items, to which I append percent_difference.span.string, and then write the row outside the innermost loop, like so:
items = []
for expatistan_title in expatistan_titles:
    percent_difference = expatistan_title.find("th","percent")
    percent_difference_title = percent_difference.span["class"][0]
    print percent_difference_title
    if percent_difference_title == "expensiver":
        items.append('+' + percent_difference.span.string)
    else:
        items.append('-' + percent_difference.span.string)
row = [Textfilelistsplit[i]]
row.extend(items)
WriteResultsFile.writerow(row)
The final error is that in the while loop you re-open the csv file and overwrite everything, so you only have the final city in the end. Accounting for all these errors (many of which you should have been able to find without help) leaves us with:
#Prepare CSV writer.
WriteResultsFile = csv.writer(open("Expatistan.csv","w"))
# write the header once, before the loop
WriteResultsFile.writerow(["City","Food","Housing","Clothes","Transportation","Personal Care", "Entertainment"])

i = 0
while i < len(Textfilelistsplit):
    url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
    page = requests.get(url).text
    print url
    soup_expatistan = BeautifulSoup(page)
    expatistan_table = soup_expatistan.find("table","comparison")
    expatistan_titles = expatistan_table.find_all("tr","expandable")
    items = []
    for expatistan_title in expatistan_titles:
        percent_difference = expatistan_title.find("th","percent")
        percent_difference_title = percent_difference.span["class"][0]
        print percent_difference_title
        if percent_difference_title == "expensiver":
            items.append('+' + percent_difference.span.string)
        else:
            items.append('-' + percent_difference.span.string)
    row = [Textfilelistsplit[i]]
    row.extend(items)
    WriteResultsFile.writerow(row)
    i += 1
YAA - Yet Another Answer.
Unlike the other answers, this treats the data as a series of key-value pairs; i.e., a list of dictionaries, which are then written to CSV. A list of wanted fields is provided to the csv writer (DictWriter), which discards additional information (beyond the specified fields) and blanks missing information. Also, should the order of the information on the original page change, this solution is unaffected.
I also assume you are going to open the CSV file in something like Excel. Additional parameters need to be given to the csv writer for this to happen nicely (see dialect parameter). Given that we are not sanitising the returned data, we should explicitly delimit it with unconditional quoting (see quoting parameter).
import csv
import requests
from bs4 import BeautifulSoup

#Read text file
with open("City.txt") as cities_h:
    cities = cities_h.readlines()

home_city = "Phoenix"
city_data = []
for city in cities:
    city = city.strip()  # drop the trailing newline left by readlines
    url = "http://www.expatistan.com/cost-of-living/comparison/%s/%s" % (home_city, city)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, from_encoding = resp.encoding)
    titles = soup.select("table.comparison tr.expandable")
    if titles:
        data = {}
        for title in titles:
            name = title.find("th", class_ = "clickable")
            diff = title.find("th", class_ = "percent")
            exp = bool(diff.find("span", class_ = "expensiver"))
            data[name.text] = ("+" if exp else "-") + diff.span.text
        data["City"] = soup.find("strong", class_ = "city-2").text
        city_data.append(data)

with open("Expatistan.csv","w") as csv_h:
    fields = \
    [
        "City",
        "Food",
        "Housing",
        "Clothes",
        "Transportation",
        "Personal Care",
        "Entertainment"
    ]

    #Prepare CSV writer.
    writer = csv.DictWriter\
    (
        csv_h,
        fields,
        quoting = csv.QUOTE_ALL,
        extrasaction = "ignore",
        dialect = "excel",
        lineterminator = "\n",
    )

    writer.writeheader()
    writer.writerows(city_data)
