Read Excel XML into a DataFrame in Python

I'm reading a 20000 x 16 table from an Excel XML file. I'm using this function found on GitHub and it works, but it is really slow. Is there any way to speed it up?
import pandas as pd
from bs4 import BeautifulSoup

source = 'Positions_20171110.xls'

def read_excel_xml(path):
    """ Converts Excel XML to a [[[list]]] """
    file = open(path).read()
    soup = BeautifulSoup(file, 'xml')
    workbook = []
    for sheet in soup.findAll('Worksheet'):
        sheet_as_list = []
        for row in sheet.findAll('Row'):
            row_as_list = []
            for cell in row.findAll('Cell'):
                row_as_list.append(cell.Data.text)
            sheet_as_list.append(row_as_list)
        workbook.append(sheet_as_list)
    return workbook

data = read_excel_xml(source)
df = pd.DataFrame(data[0][1:], columns=data[0][0])
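Most of the time here is likely spent by BeautifulSoup building a full tree for a 20000-row file. A streaming parse with lxml.etree.iterparse is usually much faster. A rough sketch, assuming the standard SpreadsheetML (Excel 2003 XML) namespace and, like the original, that every cell has a Data child and no columns are skipped; note it also flattens all worksheets into one list of rows:

import pandas as pd
from lxml import etree

# SpreadsheetML (Excel 2003 XML) namespace
NS = '{urn:schemas-microsoft-com:office:spreadsheet}'

def read_excel_xml_fast(path):
    """Stream rows with iterparse instead of building a full soup tree."""
    rows = []
    for _, row in etree.iterparse(path, tag=NS + 'Row'):
        rows.append([data.text for data in row.iter(NS + 'Data')])
        row.clear()  # release the parsed element to keep memory flat
    return rows

data = read_excel_xml_fast(source)
df = pd.DataFrame(data[1:], columns=data[0])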


Extracting Data from Multiple PDFs

I am trying to extract data from a PDF document and have a question regarding that: I was able to get the code working for a single PDF. However, is there a way I can point the code at a folder with multiple PDFs and get the extract out as CSV? I am a complete beginner in Python, so any help will be appreciated. Below is the current code that I have.
import pdfplumber
import pandas as pd

file = 'Test Slip.pdf'
lines = []
with pdfplumber.open(file) as pdf:
    pages = pdf.pages
    for page in pdf.pages:
        text = page.extract_text()
        for line in text.split('\n'):
            lines.append(line)
            print(line)
df = pd.DataFrame(lines)
df.to_csv('test.csv')
One possible option would be to use os.listdir and only read files that end in .pdf:
import os

folder_with_pdfs = '/path/to/folder'
for pdf_file in os.listdir(folder_with_pdfs):
    if pdf_file.endswith('.pdf'):
        pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
        # do pdf reading with opening pdf_file_path
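Equivalently, if you prefer pathlib (a matter of taste, not a requirement), glob does the filtering and path joining in one step:

from pathlib import Path

folder_with_pdfs = Path('/path/to/folder')
for pdf_file_path in folder_with_pdfs.glob('*.pdf'):
    # do pdf reading with opening pdf_file_path
    ...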
I am not sure why you aim to write the lines to a dataframe as rows, but this should be what you need:
import pdfplumber
import pandas as pd
import os

def extract_pdf(pdf_path):
    linesOfFile = []
    with pdfplumber.open(pdf_path) as pdf:
        for pdf_page in pdf.pages:
            single_page_text = pdf_page.extract_text()
            for line in single_page_text.split('\n'):
                linesOfFile.append(line)
            #print(linesOfFile)
    return linesOfFile

folder_with_pdfs = 'folder_path'
linesOfFiles = []
for pdf_file in os.listdir(folder_with_pdfs):
    if pdf_file.endswith('.pdf'):
        pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
        linesOfFile = extract_pdf(pdf_file_path)
        linesOfFiles.append(linesOfFile)

df = pd.DataFrame(linesOfFiles)
df.to_csv('test.csv')
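Note that with this layout each PDF becomes one row of the DataFrame, with one column per line. If you would rather have one line per row, tagged by source file, a long-format variant (hypothetical column names, reusing extract_pdf from above) would look like:

records = []
for pdf_file in os.listdir(folder_with_pdfs):
    if pdf_file.endswith('.pdf'):
        pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
        for line in extract_pdf(pdf_file_path):
            records.append({'file': pdf_file, 'line': line})

df = pd.DataFrame(records)
df.to_csv('test.csv', index=False)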

In Python, what is the best way to read a PDF table with no outline?

I am trying to read data from a table in a PDF into a pandas dataframe. I am able to do so using tabula-py when the PDF has outlines around the table, but when I try it on a PDF without an outline the script produces an error.
For example, I am looking at the pdfs available from two different urls. I have downloaded the pdfs from the urls and saved them as 'JSE Opts.pdf' and 'JSE Divs.pdf' respectively.
import requests
import pandas as pd

url = 'https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Dividends/ED_DividendsReport.pdf'
response = requests.get(url)
fname = 'JSE Divs.pdf'
f = open(fname, 'wb')
f.write(response.content)
f.close()

url = 'https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Options%20Daily%20Traded%20Report/ED_OptionsDailyTradedReport.pdf'
response = requests.get(url)
fname = 'JSE Opts.pdf'
f = open(fname, 'wb')
f.write(response.content)
f.close()
I am able to read the 'JSE Opts.pdf' into a pandas dataframe using the code:
import tabula as tb

pdf = './JSE Opts.pdf'
data = tb.read_pdf(pdf, pages=1)
data = data[0]
print(data)
When I try to do the same for 'JSE Divs.pdf', I get errors and tabula-py is only able to read the header:
pdf = './JSE Divs.pdf'
data = tb.read_pdf(pdf, pages=1)
data = data[0]
print(data)
I suspect that this is because there are no lines around the table. If that is the case, what is the best way to go about reading the data from 'JSE Divs.pdf' into pandas?
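One thing worth trying before dropping down to raw text extraction: tabula-py exposes Tabula's stream mode, which infers columns from whitespace rather than ruled lines and exists for exactly this kind of table. Whether it parses this particular report cleanly is untested:

import tabula as tb

# stream=True uses whitespace-based column detection instead of ruling lines
pdf = './JSE Divs.pdf'
data = tb.read_pdf(pdf, pages=1, stream=True)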
I was able to read the data into a string using pdfplumber, save the string as a CSV file (after cleaning the data to suit my needs) and then import into pandas.
import pdfplumber

pdf = pdfplumber.open("./JSE Divs.pdf")
text = ''
i = 0
while True:
    try:
        text += pdf.pages[i].extract_text() + '\n'
        i = i + 1
    except IndexError:
        break

for replace_s in [' DN', ' CA1', ' ANY', ' CSH', ' PHY', ' QUANTO']:
    text = text.replace(replace_s, '')

while True:
    try:
        idx = text.index('EXO')
        replace_s = text[idx-1:idx+8]
        text = text.replace(replace_s, '')
    except ValueError:
        break

cols = 'EXPIRY_s,USYM,EXPIRY,EX_DATE,CUM_PV_DIVS,CUM_DIVS,ISIN,INSTR_ID\n'
text = text[text.index('Div\n')+4:]
text = cols + text
text = text.replace(' ', ',')

f = open('divs.csv', 'w')
f.write(text)
f.close()
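The final import step mentioned above is then one more call, assuming the divs.csv written by the code (the header row comes from the cols string):

import pandas as pd

df = pd.read_csv('divs.csv')
print(df.head())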

How to write a CSV file to HTML?

I can read a CSV file, but I have a problem: how do I read the CSV file and save it as an HTML table in table.html?
import csv

html_about = ''
names = []
with open('filo.csv') as data_file:
    csv_data = csv.reader(data_file)
    for line in csv_data:
        names.append(f'{line[0]}')

html_output = '\n<ul>'
for name in names:
    html_output += f'\n\t<li>{name}</li>'
html_output += '\n</ul>'

from prettytable import PrettyTable
x = PrettyTable(line[0])
html_code = x.get_html_string()
html_file = open('table.html', 'w')
html_file = html_file.write(html_code)
I suggest you use the pandas library; it has pd.read_csv and also DataFrame.to_html.
Usage should look like this, let me know if this works for you:
import pandas as pd

df = pd.read_csv('filo.csv')
with open('table.html', 'w') as html_file:
    df.to_html(html_file)
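In recent pandas versions, to_html also accepts a file path directly, so the explicit open() can be dropped:

df.to_html('table.html')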

python: get full formula from excel, using xlrd

I'm trying to get the full formula from an Excel file.
I tried many ways, but they all get me the value; I need the full formula that is in the cell, not the value itself.
I'm using Python with xlrd.
Is there any function I can use, or any other way to do this?
Thanks a lot.
So I know this is a very old post, but I found a decent way of getting the formulas from all the sheets in a workbook as well as having the newly created workbook retain all the formatting.
The first step is to save a copy of your .xlsx file as .xls; use the .xls as the filename in the code below.
This uses Python 2.7.
from lxml import etree
from StringIO import StringIO
import xlsxwriter
import subprocess
from xlrd import open_workbook
from xlutils.copy import copy
from xlsxwriter.utility import xl_cell_to_rowcol
import os

file_name = '<YOUR-FILE-HERE>'
dir_path = os.path.dirname(os.path.realpath(file_name))
subprocess.call(["unzip", str(file_name + "x"), "-d", "file_xml"])

xml_sheet_names = dict()
with open_workbook(file_name, formatting_info=True) as rb:
    wb = copy(rb)
    workbook_names_list = rb.sheet_names()
    for i, name in enumerate(workbook_names_list):
        xml_sheet_names[name] = "sheet" + str(i + 1)

sheet_formulas = dict()
for i, k in enumerate(workbook_names_list):
    xmlFile = os.path.join(dir_path, "file_xml/xl/worksheets/{}.xml".format(xml_sheet_names[k]))
    with open(xmlFile) as f:
        xml = f.read()
    tree = etree.parse(StringIO(xml))
    context = etree.iterparse(StringIO(xml))
    sheet_formulas[k] = dict()
    for _, elem in context:
        if elem.tag.split("}")[1] == 'f':
            cell_key = elem.getparent().get(key="r")
            cell_formula = elem.text
            sheet_formulas[k][cell_key] = str("=" + cell_formula)

sheet_formulas
Structure of the dictionary 'sheet_formulas':
{'Worksheet_Name': {'A1_cell_reference': 'cell_formula'}}
Example results:
{u'CY16': {'A1': '=Data!B5',
           'B1': '=Data!B1',
           'B10': '=IFERROR(Data!B12,"")',
           'B11': '=IFERROR(SUM(B9:B10),"")',
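On Python 3, if you can work with the .xlsx directly, openpyxl is the simpler route: it hands back formula strings when data_only is left False. A minimal sketch, with the file name assumed:

from openpyxl import load_workbook

# data_only=False (the default) returns formula strings, not cached values
wb = load_workbook('<YOUR-FILE-HERE>.xlsx', data_only=False)
for ws in wb.worksheets:
    for row in ws.iter_rows():
        for cell in row:
            if isinstance(cell.value, str) and cell.value.startswith('='):
                print(ws.title, cell.coordinate, cell.value)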

Scraping values from HTML header and saving as a CSV file in Python

All,
I've just started using Python (v 2.7.1) and one of my first programs is trying to scrape information from a website containing power station data using the Standard Library and BeautifulSoup to handle the HTML elements.
The data I'd like to access is obtainable in either the 'Head' section of the HTML or as tables within the main body. The website will generate a CSV file from its data if the CSV link is clicked.
Using a couple of sources on this website I've managed to cobble together the code below, which will pull the data out and save it to a file, but it contains the \n designators. Try as I might, I can't get a correct CSV file to save out.
I am sure it's something simple but need a bit of help if possible!
from BeautifulSoup import BeautifulSoup
import urllib2,string,csv,sys,os
from string import replace
bm_url = 'http://www.bmreports.com/servlet/com.logica.neta.bwp_PanBMDataServlet?param1=T_COTPS-4&param2=&param3=&param4=&param5=2011-02-05&param6=*'
data = urllib2.urlopen(bm_url).read()
soup = BeautifulSoup(data)
data = str(soup.findAll('head',limit=1))
data = replace(data,'[<head>','')
data = replace(data,'<script language="JavaScript" src="/bwx_generic.js"></script>','')
data = replace(data,'<link rel="stylesheet" type="text/css" href="/bwx_style.css" />','')
data = replace(data,'<title>Historic Physical Balancing Mechanism Data</title>','')
data = replace(data,'<script language="JavaScript">','')
data = replace(data,' </script>','')
data = replace(data,'</head>]','')
data = replace(data,'var gs_csv=','')
data = replace(data,'"','')
data = replace(data,"'",'')
data = data.strip()
file_location = 'c:/temp/'
file_name = file_location + 'DataExtract.txt'
file = open(file_name,"wb")
file.write(data)
file.close()
Don't turn it back into a string and then use replace. That completely defeats the point of using BeautifulSoup!
Try starting like this:
scripttag = soup.head.findAll("script")[1]
javascriptdata = scripttag.contents[0]
Then you can use:
partition('=')[2] to cut off the "var gs_csv" bit.
strip(' \n"') to remove unwanted characters at each end (space, newline, ")
replace("\\n","\n") to sort out the new lines.
Incidentally, replace is a string method, so you don't have to import it separately, you can just do data.replace(....
Finally, you need to separate it as csv. You could save it and reopen it, then load it into a csv.reader. You could use the StringIO module to turn it into something you can feed directly to csv.reader (i.e. without saving a file first). But I think this data is simple enough that you can get away with doing:
for line in data.splitlines():
    row = line.split(",")
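If the fields can themselves contain quoted commas, the StringIO route mentioned above lets csv.reader handle the quoting for you. A sketch in the same Python 2 style as the rest of the thread:

from StringIO import StringIO
import csv

for row in csv.reader(StringIO(javascriptdata)):
    print row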
SOLUTION
from BeautifulSoup import BeautifulSoup
import urllib2, string, csv, sys, os, time

bm_url_stem = "http://www.bmreports.com/servlet/com.logica.neta.bwp_PanBMDataServlet?param1="
bm_station = "T_COTPS-3"
bm_param = "&param2=&param3=&param4=&param5="
bm_date = "2011-02-04"
bm_param6 = "&param6=*"
bm_full_url = bm_url_stem + bm_station + bm_param + bm_date + bm_param6

data = urllib2.urlopen(bm_full_url).read()
soup = BeautifulSoup(data)

scripttag = soup.head.findAll("script")[1]
javascriptdata = scripttag.contents[0]
javascriptdata = javascriptdata.partition('=')[2]
javascriptdata = javascriptdata.strip(' \n"')
javascriptdata = javascriptdata.replace("\\n", "\n")
javascriptdata = javascriptdata.strip()

csvwriter = csv.writer(file("c:/temp/" + bm_station + "_" + bm_date + ".csv", "wb"))
for line in javascriptdata.splitlines():
    row = line.split(",")
    csvwriter.writerow(row)
del csvwriter
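One small caveat on the last two lines: del csvwriter only drops the name, and it is garbage collection of the underlying file object that actually flushes and closes it. Wrapping the file in a with block makes that explicit:

with open("c:/temp/" + bm_station + "_" + bm_date + ".csv", "wb") as f:
    csvwriter = csv.writer(f)
    for line in javascriptdata.splitlines():
        csvwriter.writerow(line.split(","))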
