Show excel data in only one Sheet

Show excel data in only one Sheet - python

I'm having some doubts with the following function. I want it to show me the result in a single excel tab but I can't.
def create_df_from_table(c, tab, excelWriter):
    """Convert one document table to a DataFrame and write it to its own sheet.

    Args:
        c: Position of the table in the document; ``str(c)`` is used as the
            worksheet name.
        tab: Table object exposing ``rows``; every row exposes ``cells`` and
            every cell exposes ``text`` (presumably a python-docx table --
            confirm against the caller).
        excelWriter: ``pandas.ExcelWriter`` that receives the sheet.

    Returns:
        The DataFrame built from the table. Every table row, including the
        first one, becomes a data row; no header row is extracted.
    """
    # Read from the ``tab`` argument; relying on the caller's ``each_tab``
    # loop variable only works while that global happens to exist.
    table_rows = [
        tuple(each_cell.text for each_cell in each_row.cells)
        for each_row in tab.rows
    ]
    result_df = pd.DataFrame(table_rows)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df
# ``ExcelWriter.save()`` was deprecated and then removed in pandas 2.0; the
# context manager writes and closes the workbook when the block exits, even if
# one of the tables raises.
with pd.ExcelWriter('tablasFromDocx1.xlsx') as excelWriter:
    for c, each_tab in enumerate(file.tables):
        # Keep each table's DataFrame reachable as a module-level
        # ``result_df_<n>`` name.
        globals()[f'result_df_{c}'] = create_df_from_table(c, each_tab, excelWriter)
The call to result_df.to_excel() in the code above writes each dataframe to Excel, but every table ends up in its own tab. I need all of the data in a single tab.

Related

Hitting Error while iterating the row under While loop followed by If Else condition for Pandas Dataframe

With my current task I am trying to iterate all of my excel rows and then copy each row as a new record to the destination template file. the source file and the destination files are different hence I am assigning the value to the destination cell.
# code run sample file
# import os
import pandas as pd
import xlwings as xw
# Script from the question: read an Excel sheet and walk its rows, collecting a
# flag for every row whose B14 value is one of the known codes.
newdir_path = " "
file1 = "list.xlsx"
# read the source file
data = pd.read_excel(file1, sheet_name=0, header=0,
                     index_col=False, keep_default_na=True)
# creating pandas dataframe from the source file
df = pd.DataFrame(data, columns=['Funktion', 'AD65', 'W70', 'B14', 'AC21'])
# NOTE(review): B14 is a list here, yet it is compared with single strings
# below, and it is rebound to a row value inside the branches.
B14 = ['RR', 'BB', 'RA', 'MM']
booleans = []
# to iterate all the rows
for i in df.itertuples(index=True):
    # only to read the rows where column AD65(2nd column) does not have blank value or none
    # NOTE(review): the name AD65 is not assigned before this line is first
    # reached (it is only bound inside the branches below), and nothing in the
    # loop body changes `i`, so this `while` looks like it was meant to be `if`.
    while i[AD65] != 'None':
        # to retrieve the row values
        # when B14 value is RR
        if B14 == 'RR':
            print(i)
            # retrieving the values
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        # when B14 value is BB
        elif B14 == 'BB':
            print(i)
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        elif B14 == 'RA':
            # repeating the same as above
            booleans.append(True)
        elif B14 == 'MM':
            booleans.append(True)
        else:
            # I want to skip the rows when B14 value is blank
            # NOTE(review): the snippet ends here; this branch has no statement yet.
Note: I also tried my condition with if df.loc[['B14'] == 'RR']: and it is throwing raise KeyError(key)
KeyError: False

openpyxl read Table to python

I'm trying to read an excel Table object into python and can't find any syntax for doing so.
It would be useful to read a whole table into e.g. a dict of dicts (I'm trying not to pull in pandas as a dependency for this particular project).
I can't find any way of doing this.

The code below reads through the table row by row. You can also restrict it to a specific cell range.
import openpyxl
wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
# Slicing a worksheet yields the rows of that range as tuples of cells.
# Iterate over the slice itself so that only A1:B7 is read; a bare
# ``sheet['A1':'B7']`` statement discards its result and the loop would then
# walk every row of the sheet.
for i1, i2 in sheet['A1':'B7']:
    print("{0:8} {1:8}".format(i1.value, i2.value))
Output:
Student_name Marks
Tony Stark 47
Loki 59
Oddin 73
Nick Fury 62
Samaul 75
Peter Parkar 80

I got it working as follows:
import openpyxl
def all_tables_data(filename: str) -> dict:
    """
    Get values for all tables in a spreadsheet.

    Returns a dict of tables, keyed by table name.
    Table values are given as lists of lists (one inner list per row of the
    table's cell range, header row included).

    Uses the public ``Worksheet.tables`` mapping (table name -> table); on
    current openpyxl releases iterating the private ``_tables`` attribute
    yields table names rather than table objects.
    """
    workbook = openpyxl.load_workbook(filename)
    return {
        table.name: [[cell.value for cell in row] for row in worksheet[table.ref]]
        for worksheet in workbook.worksheets
        for table in worksheet.tables.values()
    }
if __name__ == "__main__":
    # Example run: load every table of the workbook and print the resulting
    # dict of lists of lists.
    FILENAME = "my_spreadsheet.xlsx"
    TABLES = all_tables_data(FILENAME)
    print(TABLES)
I'm interested in converting tables to dicts.
Often the rows/entries in a table may have >1 key.
Assuming the table has a header row, I also cobbled the code below together.
It can be used to convert a table to a dict of rows, with each row being a dict of values, keyed by the column headers.
class WorkbookWithTables:
    """Wrap an openpyxl workbook and index its worksheet tables by name.

    Attributes are filled once in ``__init__``:
    ``tables_by_name`` maps table name -> table object and
    ``table_worksheets`` maps table name -> the worksheet holding that table.
    """

    def __init__(self, workbook):
        """Index every table found on every worksheet of ``workbook``."""
        self.workbook = workbook
        self.tables_by_name = {}
        self.table_worksheets = {}
        for worksheet in self.workbook.worksheets:
            # ``Worksheet.tables`` is the public name -> table mapping.
            for table in worksheet.tables.values():
                self.tables_by_name[table.name] = table
                self.table_worksheets[table.name] = worksheet

    @classmethod
    def from_file(cls, filename):
        """Alternate constructor: load ``filename`` with openpyxl and wrap it."""
        _workbook = openpyxl.load_workbook(filename)
        return cls(_workbook)

    def table_to_dict(self, table_name, n_keys=1):
        """Convert a table with a header row to a dict of row dicts.

        Args:
            table_name: Name of the table to convert.
            n_keys: Number of leading columns that form the row key.

        Returns:
            A dict with one entry per data row. The key is the value of the
            first column when ``n_keys == 1``, otherwise a tuple of the first
            ``n_keys`` values. Each value is a dict mapping the remaining
            column headers to that row's cell values.

        Raises:
            KeyError: If ``table_name`` is not a known table.
        """
        worksheet = self.table_worksheets[table_name]
        cell_range = worksheet[self.tables_by_name[table_name].ref]
        # Headers of the non-key columns come from the first row of the range.
        value_headers = tuple(cell.value for cell in cell_range[0][n_keys:])
        table_dict = {}
        for row in cell_range[1:]:
            if n_keys == 1:
                key = row[0].value
            else:
                key = tuple(partial_key.value for partial_key in row[:n_keys])
            table_dict[key] = dict(
                zip(value_headers, (cell.value for cell in row[n_keys:]))
            )
        return table_dict
if __name__ == "__main__":
    # Example run: one table keyed by its first column, one keyed by the
    # tuple of its first two columns.
    FILENAME = "my_spreadsheet.xlsx"
    WB = WorkbookWithTables.from_file(FILENAME)
    MY_SINGLE_KEYED_TABLE = WB.table_to_dict("my_single_keyed_table")
    MY_DOUBLE_KEYED_TABLE = WB.table_to_dict("my_double_keyed_table", 2)

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but I cannot convert the text table to a standard tabular one. My code is attached. Can someone help me with it?
# Download the filing and split it into lines for the table search below.
# NOTE(review): `requests` is used here without a visible import in this snippet.
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
# Text that identifies the table's title line.
table_title = 'PRINCIPAL STOCKHOLDERS'
# Marker line searched for to find where the table's data ends.
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title, splited_data):
    """Return the index of every line that contains ``table_title``.

    Args:
        table_title: Substring to look for.
        splited_data: Sequence of text lines.

    Returns:
        A list of 0-based line indices in ascending order; empty when no line
        matches.
    """
    return [
        index
        for index, line in enumerate(splited_data)
        if table_title in line
    ]
# Indices of every line that mentions the table title.
table_start = find_no_line_start_table(table_title,splited_data)
# Help wanted with locating the table: the search above returns two
# locations, and the correct one (the second match) has to be chosen by hand.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
    """Find the first line at or after ``table_start`` that contains ``<C>``.

    Args:
        table_start: Index in ``splited_data`` at which the search begins.
        splited_data: Sequence of text lines.

    Returns:
        The index of the first matching line, or ``None`` when no line from
        ``table_start`` onwards contains ``<C>``.
    """
    # ``start=`` makes enumerate count in absolute line numbers.
    for line_no, line in enumerate(splited_data[table_start:], start=table_start):
        if '<C>' in line:
            return line_no
    return None
def get_end_table(start_table_data, splited_data):
    """Find the first line at or after ``start_table_data`` that ends the table.

    A line ends the table when it contains the module-level
    ``END_TABLE_LINE`` marker.

    Args:
        start_table_data: Index in ``splited_data`` at which the search begins.
        splited_data: Sequence of text lines.

    Returns:
        The index of the first line containing the marker, or ``None`` when
        the marker does not occur from ``start_table_data`` onwards.
    """
    # ``start=`` makes enumerate count in absolute line numbers.
    for line_no, line in enumerate(
        splited_data[start_table_data:], start=start_table_data
    ):
        if END_TABLE_LINE in line:
            return line_no
    return None
def row(l):
    """Split one text line of the table into eight column values.

    Leading words are collected into column 0 up to and including the first
    word that contains ``:``. Each word after that fills the next column.

    Args:
        l: One line of the text table.

    Returns:
        A list of eight strings, or ``None`` when the line has fewer than
        eight whitespace-separated words. Column 0 is built by joining onto
        an empty string, so it starts with a single space. When no word
        contains ``:`` all words end up in column 0 and the remaining columns
        stay empty.

    Raises:
        IndexError: If more than seven words follow the word that closes
            column 0.
    """
    number_columns = 8
    words = l.split()
    if len(words) < number_columns:
        return None

    data_row = [''] * number_columns
    first_column_done = False
    index = 0
    for word in words:
        if first_column_done:
            index += 1
            data_row[index] = word
            continue
        data_row[0] = ' '.join([data_row[0], word])
        if ':' in word:
            first_column_done = True
    return data_row
# Cut the table's lines out of the filing: from the first line containing
# '<C>' up to (not including) the line containing the end marker.
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with converting the text table to a CSV file; somehow the
# following function does not recognize my columns.
def take_table(table):
    """Turn the table's text lines into a column-oriented dict.

    Every line is parsed with ``row()``; lines for which ``row()`` returns a
    falsy value are skipped.

    Args:
        table: Iterable of text lines.

    Returns:
        A dict mapping each of the eight column names to the list of that
        column's values, one entry per parsed line. All lists are empty when
        no line could be parsed.
    """
    column_names = (
        'owner', 'Num_share', 'middle', 'middle_1',
        'middle_2', 'middle_3', 'prior_offering', 'after_offering',
    )
    parsed_rows = [data_row for data_row in map(row, table) if data_row]
    # Build each column by position instead of keeping eight parallel lists.
    return {
        name: [parsed[position] for parsed in parsed_rows]
        for position, name in enumerate(column_names)
    }
#print (table)
# Parse the table lines into columns, wrap them in a DataFrame and write the
# result to a CSV file.
# NOTE(review): `pd` is used here without a visible import in this snippet.
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')

I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)

Using pandas drop duplicates but doesn't correctly drop the duplicates

First of all, I'm not sure whether it is drop_duplicates() fault or not.
What I want to do:
Import file from csv, do a re.search on every row, if match, keep the row inside a dictionary, if doesn't match, keep the row inside another dictionary. Make a graph out of the length of the dictionary value.
The problem
I have 1000 rows inside csv, but the result returns 1200.
My code
import pandas as pd
import re
# import data
filename = 'sample.csv'
# save data as data
data = pd.read_csv(filename, encoding='utf-8')
# Four module-level buckets that word_in_text() fills under the key 'text':
# kaiT / kaiF hold texts that match the searched word (label True / False),
# wordNT / wordNF hold texts that do not match it (label True / False).
wordNT = {}
wordNF = {}
kaiT = {}
kaiF = {}
# if text is True
def word_in_text(word, text, label):
    """File ``text`` into one of the four module-level buckets.

    The bucket is chosen by whether ``word`` (used as a regular expression)
    is found in ``text`` and by ``label``:

    * found and label True -> ``kaiT``
    * found and label False -> ``kaiF``
    * not found and label True -> ``wordNT``
    * not found and label False -> ``wordNF``

    The text is appended to the bucket's ``'text'`` list. Nothing is stored
    when ``label`` equals neither ``True`` nor ``False``.

    Args:
        word: Pattern passed to ``re.search``.
        text: String to search and to store.
        label: Truth label of the row.
    """
    found = re.search(word, text) is not None
    # ``==`` rather than ``is`` is kept on purpose: labels presumably arrive
    # from pandas and may be non-builtin booleans -- confirm against the data.
    if label == True:  # noqa: E712
        bucket = kaiT if found else wordNT
    elif label == False:  # noqa: E712
        bucket = kaiF if found else wordNF
    else:
        return
    bucket.setdefault('text', []).append(text)
# iterate every text in data
for index, row in data.iterrows():
    # Both calls append to the same four buckets, so every row is filed twice
    # (once per searched word).
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
# make pandas data frame out of dict
wordTDf = pd.DataFrame.from_dict(wordNT)
wordFDf = pd.DataFrame.from_dict(wordNF)
kaiTDf = pd.DataFrame.from_dict(kaiT)
kaiFDf = pd.DataFrame.from_dict(kaiF)
# drop duplicates (within each frame only; a text filed in two different
# frames is still counted once per frame)
wordTDf = wordTDf.drop_duplicates()
wordFDf = wordFDf.drop_duplicates()
kaiTDf = kaiTDf.drop_duplicates()
kaiFDf = kaiFDf.drop_duplicates()
# count how many
wordTrueCount = len(wordTDf.index)
wordFalseCount = len(wordFDf.index)
kaiTrueCount = len(kaiTDf.index)
kaiFalseCount = len(kaiFDf.index)
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
When I removed the line
word_in_text('bar', row['text'], row['label'])
and only keep
word_in_text('foo', row['text'], row['label'])
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount) returns 1000 correctly, and vice versa.
But when I don't, it returns 1200 when it should only be 1000?
CSV INPUT sample
text,label
"hey", TRUE
"halo", FALSE
"How are you?", TRUE
EXPECTED OUTPUT
1000
OUTPUT
1200

The function word_in_text updates four dicts: wordNT, wordNF, kaiT and kaiF.
And you call word_in_text twice while iterating the dataframe:
# iterate every text in data
for index, row in data.iterrows():
    # two searches per row, both writing into the same four dicts
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
So the searching result is the mix of the result from 'foo' and result from 'bar'.
Instead, you should empty the four dicts before starting each new search:
def search(text):
    """Count the distinct texts per bucket for a single search word.

    Empties the four module-level buckets, files every row of ``data`` with
    ``word_in_text()``, drops duplicate texts inside each bucket and prints
    the sum of the four bucket sizes.

    Args:
        text: Word (regular expression) to search for in each row's text.
    """
    # word_in_text() appends to the module-level dicts, so those are the ones
    # that must be emptied. Rebinding fresh local dicts here would leave the
    # module-level ones untouched and the counts would be taken from the
    # empty locals.
    buckets = (wordNT, wordNF, kaiT, kaiF)
    for bucket in buckets:
        bucket.clear()

    # iterate every text in data
    for index, row in data.iterrows():
        word_in_text(text, row['text'], row['label'])

    # make a data frame out of each dict, drop duplicates, count the rows
    total = 0
    for bucket in buckets:
        total += len(pd.DataFrame.from_dict(bucket).drop_duplicates().index)
    print(total)


search('foo')
search('bar')

openpyxl read tables from existing data book example?

In the openpyxl documentation there is an example of how to place a table into a workbook but there are no examples of how to find back the tables of a workbook. I have an XLS file that has named tables in it and I want to open the file, find all of the tables and parse them. I cannot find any documentation on how to do this. Can anyone help?
In the meantime I worked it out and wrote the following class to work with openpyxl:
class NamedArray(object):
    ''' Excel Named range object

        Reproduces the named range feature of Microsoft Excel
        Assumes a definition in the form <Worksheet PinList!$A$6:$A$52 provided by openpyxl
        Written for use with, and initialised by the get_names function
        After initialisation named array can be used in the same way as for VBA in excel
        Written for openpyxl version 2.4.1, may not work with earlier versions
        Cells are read with cell(row=..., column=...)
    '''
    C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def __init__(self, wb, named_range_raw):
        ''' Initialise a NameArray object from the named_range_raw information in the given workbook

            wb is indexed by sheet name to obtain the worksheet
            named_range_raw is a definition such as 'Sheet'!$A$6:$A$52 or Sheet!$B$2
        '''
        self.sheet, cellrange_str = str(named_range_raw).split('!')
        self.sheet = self.sheet.replace("'", '')  # remove the single quotes if they exist
        self.loc = wb[self.sheet]
        if ':' in cellrange_str:
            self.has_range = True
            self.has_value = False
            lo, hi = cellrange_str.split(':')
            self.ad_lo = lo.replace('$', '')
            self.ad_hi = hi.replace('$', '')
        else:
            # a single cell: low and high address are the same
            self.has_range = False
            self.has_value = True
            self.ad_lo = cellrange_str.replace('$', '')
            self.ad_hi = self.ad_lo
        self.row = self.get_row(self.ad_lo)
        self.max_row = self.get_row(self.ad_hi)
        self.rows = self.max_row - self.row + 1
        self.min_col = self.col_to_n(self.ad_lo)
        self.max_col = self.col_to_n(self.ad_hi)
        self.cols = self.max_col - self.min_col + 1

    def size_of(self):
        ''' Returns two dimensional size of named space as (columns, rows)
        '''
        return self.cols, self.rows

    def value(self, row=1, col=1):
        ''' Returns the value at row, col (both 1-based, relative to the named range)
        '''
        assert row <= self.rows, 'invalid row number given'
        assert col <= self.cols, 'invalid column number given'
        # address the cell numerically; no need to build an "A1" style string
        return self.loc.cell(row=self.row + row - 1,
                             column=self.min_col + col - 1).value

    def __str__(self):
        ''' printed description of named space
        '''
        # has_range is the attribute set in __init__
        locs = 's ' + self.ad_lo + ':' + self.ad_hi if self.has_range else ' ' + self.ad_lo
        return('named range' + str(self.size_of()) + ' in sheet ' + self.sheet + ' # location' + locs)

    def __contains__(self, val):
        ''' True if any cell of the named range equals val
        '''
        return any(
            self.value(row, col) == val
            for row in range(1, self.rows + 1)
            for col in range(1, self.cols + 1)
        )

    def vlookup(self, key, col):
        ''' excel style vlookup function

            Searches the first column for key and returns the value in column col
            of the first matching row, or None if key is not found
        '''
        assert col <= self.cols, 'invalid column number given'
        for row in range(1, self.rows + 1):
            if self.value(row, 1) == key:
                return self.value(row, col)
        return None

    def hlookup(self, key, row):
        ''' excel style hlookup function

            Searches the first row for key and returns the value in row row
            of the first matching column, or None if key is not found
        '''
        assert row <= self.rows, 'invalid row number given'
        for col in range(1, self.cols + 1):
            if self.value(1, col) == key:
                return self.value(row, col)
        return None

    @classmethod
    def get_row(cls, ad):
        ''' get row number from cell string
            Cell string is assumed to be in excel format i.e "ABC123" where row is 123
        '''
        row = 0
        for l in ad:
            if l in "1234567890":
                row = row*10 + int(l)
        return row

    @classmethod
    def col_to_n(cls, ad):
        ''' find column number from xl address
            Cell string is assumed to be in excel format i.e "ABC123" where column is abc
            column number is the 1-based bijective base-26 value, i.e. A=1, Z=26, AA=27
        '''
        n = 0
        for l in ad:
            if l in cls.C_CAPS:
                n = n*26 + cls.C_CAPS.find(l)+1
        return n

    @classmethod
    def n_to_col(cls, n):
        ''' make xl column address from column number (inverse of col_to_n)

            Returns an empty string for n <= 0
        '''
        ad = ''
        while n > 0:
            # column letters have no zero digit: shift by one before dividing,
            # otherwise multiples of 26 come out wrong (26 would give 'AZ')
            n, letter = divmod(n - 1, 26)
            ad = cls.C_CAPS[letter] + ad
        return ad
def get_names(workbook, filt='', debug=False):
    ''' Create a structure containing all of the names in the given workbook

        filt is an optional parameter and used to create a subset of names starting with filt
        useful for IO_ring_spreadsheet as all names start with 'n_'
        if present, filt characters are stripped off the front of the name
        debug, when true, prints every name and also writes the list to H:\\names.txt

        Returns a dict mapping each (stripped) name to its NamedArray
        Names whose definition starts with #REF are reported and skipped
    '''
    defined = workbook.defined_names
    # openpyxl 3.1 turned defined_names into a dict-like mapping of
    # name -> definition; older releases expose the definitions as the
    # definedName list
    if isinstance(defined, dict):
        named_ranges = defined.values()
    else:
        named_ranges = defined.definedName
    name_list = {}
    for named_range in named_ranges:
        name = named_range.name
        if named_range.attr_text.startswith('#REF'):
            print('WARNING: named range "', name, '" is undefined')
        elif filt == '' or name.startswith(filt):
            name_list[name[len(filt):]] = NamedArray(workbook, named_range.attr_text)
    if debug:
        with open("H:\\names.txt", 'w') as log:
            for item in name_list:
                print (item, '=', name_list[item])
                log.write(item.ljust(30) + ' = ' + str(name_list[item])+'\n')
    return name_list

I agree that the documentation does not really help, and the public API also seems to have only add_table() method.
But then I found an openpyxl Issue 844 asking for a better interface, and it shows that worksheet has an _tables property.
This is enough to get a list of all tables in a file, together with some basic properties:
from openpyxl import load_workbook
wb = load_workbook(filename = 'test.xlsx')
for ws in wb.worksheets:
    # ``ws.tables`` is the public mapping of table name -> table. The private
    # ``ws._tables`` attribute used to be a plain list of tables; on current
    # openpyxl releases iterating it yields table names instead.
    print("Worksheet %s include %d tables:" % (ws.title, len(ws.tables)))
    for tbl in ws.tables.values():
        print(" : " + tbl.displayName)
        print(" - name = " + tbl.name)
        # tableType can be None, which cannot be concatenated to a str
        print(" - type = " + (tbl.tableType if isinstance(tbl.tableType, str) else 'n/a'))
        print(" - range = " + tbl.ref)
        print(" - #cols = %d" % len(tbl.tableColumns))
        for col in tbl.tableColumns:
            print(" : " + col.name)
Note that the if/else construct is required for the tableType, since it can return NoneType (for standard tables), which is not convertible to str.

Building on @MichalKaut's answer, I created a simple function that returns a dictionary with all tables in a given workbook. It also puts each table's data into a Pandas DataFrame.
from openpyxl import load_workbook
import pandas as pd
def get_all_tables(filename):
    """ Get all tables from a given workbook. Returns a dictionary of tables.

    Requires a filename, which includes the file path and filename.

    The result is keyed by table name. Each value is a dict with the keys
    'table_name', 'worksheet', 'num_cols', 'table_range' and 'dataframe';
    'dataframe' holds the table's data rows as a pandas DataFrame whose
    column names come from the first row of the table range.

    Progress information is printed for every worksheet and table. """
    # Load the workbook from the ``filename`` argument (not from a global),
    # setting read_only to False
    wb = load_workbook(filename=filename, read_only=False, keep_vba=False,
                       data_only=True, keep_links=False)

    # Initialize the dictionary of tables
    tables_dict = {}

    # Go through each worksheet in the workbook
    for ws_name in wb.sheetnames:
        print("")
        print(f"worksheet name: {ws_name}")
        ws = wb[ws_name]
        print(f"tables in worksheet: {len(ws.tables)}")

        # Get each table in the worksheet
        for tbl in ws.tables.values():
            print(f"table name: {tbl.name}")

            # All rows of the table range as lists of cell values,
            # including the first (header) row
            rows_list = [[cell.value for cell in row] for row in ws[tbl.ref]]

            # Create a pandas dataframe from the rows_list.
            # The first row is the column names
            df = pd.DataFrame(data=rows_list[1:], index=None, columns=rows_list[0])

            # Some info about the table, plus the dataframe itself
            tables_dict[tbl.name] = {
                'table_name': tbl.name,
                'worksheet': ws_name,
                'num_cols': len(tbl.tableColumns),
                'table_range': tbl.ref,
                'dataframe': df,
            }

    return tables_dict
# File location (raw string, so the backslashes of the Windows path are kept as-is):
file = r"C:\Users\sean\spreadsheets\full_of_tables.xlsx"
# Run the function to return a dictionary of all tables in the Excel workbook
tables_dict = get_all_tables(filename=file)

The answer to this has changed.
ws objects now contain the tables accessor which acts as a dictionary. Updated answer is:
# One ``tables`` accessor per worksheet of the workbook ``wb``.
tmp = [ws.tables for ws in wb.worksheets]
# Flatten into a list with one single-entry dict {table name: table} per table.
tbls = [{v.name:v} for t in tmp for v in t.values()]

I'm not sure what you mean by parsing but read-support for worksheet tables has been possible since version 2.4.4. If you have questions about the details then I suggest you ask your question on the openpyxl mailing list as that is a more suitable place for this kind of discussion.

I don't think this is possible. It seems to work similarly to images: if you read and save a file with a table, the table will get stripped.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Show excel data in only one Sheet - python

Related

Hitting Error while iterating the row under While loop followed by If Else condition for Pandas Dataframe

openpyxl read Table to python

How to convert text table to dataframe

Using pandas drop duplicates but doesn't correctly drop the duplicates

openpyxl read tables from existing data book example?

Categories

Resources