UnicodeDecodeError upon reading xls files - python

I'm trying to read in and extract information from many Excel files in XLS format using Python. When I run my code, I encounter the following warnings and error:
WARNING *** file size (89002) not 512 + multiple of sector size (512)
WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x20 in position 108: truncated data
What's funny is that once I open a file manually and then run the code, it executes just fine.
Since there are about 500 files in the folder, I'd like to find out the cause of the error so that I can automate the process without having to open every single file. Any help would be appreciated!
(Below is an example of the type of xls file)
https://www.dropbox.com/s/w2r8br0nblbbr0x/A1-1a105800.XLS?dl=1
import glob
import os

import xlrd

data_year = 2007
path = 'C:/Users/hard1/Desktop/CRA/' + str(data_year)

filenames = []
#count = 0
for filename in glob.glob(os.path.join(path, '*.xls')):
    #print(filename)
    #count = count + 1
    filenames.append(filename)
#print(count)

respondent_id = []
bank_name = []
loan_amount = []
state = []
year = []

for filename in filenames:
    print(filename)
    # wb = xlrd.open_workbook(filename, encoding_override="utf_16_le")
    wb = xlrd.open_workbook(filename)
    sheet = wb.sheet_by_index(0)
    # Column M is index 12
    msa_string = sheet.cell(2, 12).value
    state_string = msa_string[-2:]  # last two characters, i.e. the state code
    col_id = sheet.col_values(5)
    col_bank = sheet.col_values(0)
    col_loan = sheet.col_values(23)
    ### And then code that extracts information from the files follows
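One workaround I'm considering (untested across all 500 files): since re-saving a file in Excel makes the error go away, xlrd seems to be choking on the string encoding recorded in the original files, so catch the decode error and retry with xlrd's encoding_override parameter. cp1252 is only a guess for these files, not something I know about them:

import xlrd

failed = []
for filename in filenames:
    try:
        wb = xlrd.open_workbook(filename)
    except UnicodeDecodeError:
        try:
            # Force a single-byte codec; cp1252 is an assumption here --
            # swap in latin-1 or another codec if files still fail to open.
            wb = xlrd.open_workbook(filename, encoding_override='cp1252')
        except UnicodeDecodeError:
            failed.append(filename)  # collect these to inspect by hand
            continue
    sheet = wb.sheet_by_index(0)
    # ... same extraction code as above ...

print('Could not open:', failed)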

Related

How can I loop a pathway through a function that only takes raw strings?

I am currently writing a script that generates a report (output is .csv) on directory contents. Each report is unique in that it is saved with a unique date/timestamp, so a report never overwrites or appends to an earlier one.
The column headers in the report are as follows:
header = ['File_Pathway', 'Subdir', 'File_Name', 'Extension', 'Size_(in_bytes)', 'File_Created', 'Last_File_Save_Date', 'File_Author', 'Last_Saved_By_User_X']
I am struggling to get File_Author and Last_Saved_By_User_X, but I found a script here that collects this information from file metadata:
import win32com.client

sh = win32com.client.gencache.EnsureDispatch('Shell.Application', 0)
ns = sh.NameSpace(r'm:\music\Aerosmith\Classics Live!')

colnum = 0
columns = []
while True:
    colname = ns.GetDetailsOf(None, colnum)
    if not colname:
        break
    columns.append(colname)
    colnum += 1

for item in ns.Items():
    print(item.Path)
    for colnum in range(len(columns)):
        colval = ns.GetDetailsOf(item, colnum)
        if colval:
            print('\t', columns[colnum], colval)
The issue I run into is with ns = sh.NameSpace(r'm:\music\Aerosmith\Classics Live!'), as it seems to only take raw strings. The path I want to pass to sh.NameSpace is a variable, current_filepath, which changes as the script loops through the directory of files.
I have tried every method from this article to convert the string variable into a raw string to pass to this function, but nothing is working. Can anyone shed some light on this for me?
For more context, here is some more sample code from the script I am writing to show you what the current_filepath variable is:
rootdir = input('Enter directory pathway: ')
count = 0

datetime_for_filename = datetime.now()
datetime_for_filename_format = str(datetime.strftime(datetime_for_filename, '%Y-%m-%d--%H-%M-%S'))
filename_with_datetimestamp = 'filename_printout' + '-' + datetime_for_filename_format + '.csv'
header = ['File_Pathway', 'Subdir', 'File_Name', 'Extension', 'Size_(in_bytes)', 'File_Created', 'Last_File_Save_Date', 'File_Author', 'Last_Saved_By_User_X']

for subdir, dirs, files in os.walk(rootdir):
    with open(filename_with_datetimestamp, 'a', newline='') as f:
        writer = csv.writer(f)
        current_subdir = subdir
        try:
            for filenames in files:
                data_list = []
                current_filepath = subdir + '\\' + filenames
                raw_current_filepath = fr"{current_filepath}"
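One thing that may unblock you: raw strings only exist in source code. The r'...' prefix just changes how a literal is parsed; at runtime a raw string is an ordinary str, so fr"{current_filepath}" does nothing, and any properly built path variable can be passed to NameSpace directly. Also note that NameSpace expects a folder, not a file. A minimal sketch (the root folder here is made up for illustration):

import os
import win32com.client

sh = win32com.client.gencache.EnsureDispatch('Shell.Application', 0)

rootdir = r'C:\example\rootdir'  # hypothetical folder, for illustration only
for subdir, dirs, files in os.walk(rootdir):
    # A plain variable works fine; os.path.abspath normalizes the slashes.
    ns = sh.NameSpace(os.path.abspath(subdir))
    if ns is not None:
        for item in ns.Items():
            print(item.Path)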

Taking the Same Worksheet from a Folder of xlsm Files with Python

I'm new to pandas/Python and I've come up with the following code to extract data from a specific part of a worksheet.
import openpyxl as xl
import pandas as pd

rows_with_data = [34, 37, 38, 39, 44, 45, 46, 47, 48, 49, 50, 54, 55, 57, 58, 59, 60,
                  62, 63, 64, 65, 66, 70, 71, 72, 76, 77, 78, 79, 80, 81, 82, 83, 84,
                  88, 89, 90, 91, 92]

path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
sheetname = r'Summary'

wb = xl.load_workbook(filename=xpath + '\\' + file, data_only=True)
sheet = wb.get_sheet_by_name(sheetname)

rows = len(rows_with_data)

line_items = []
for i in range(rows):
    line_items.append(sheet.cell(row=rows_with_data[i], column=13).value)

period = []
for col in range(17, 35):
    period.append(sheet.cell(row=20, column=col).value)
print(line_items)

vals = []
x = []
for i in range(rows):
    if i != 0:
        vals.append(x)
        x = []
    for col in range(17, 35):
        x.append(sheet.cell(row=rows_with_data[i], column=col).value)
vals.append(x)

all_values = {}
all_values['Period'] = period
for i in range(rows):
    print(line_items[i])
    all_values[line_items[i]] = vals[i]
print(all_values)

period_review = input('Enter a period (i.e. 2002): ')
item = input('Enter an item (i.e. XXX): ')
time = period.index(period_review)
display_item = str(all_values[item][time])
print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)
writer = pd.ExcelWriter(xpath + '\\' + 'values.xlsx')
Summary_Dataframe.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
I have the same worksheet (summary results) across a library of 60 xlsm files, and I'm having a hard time figuring out how to iterate this across the entire folder. I also want to change this from extracting specific rows to taking the entire "Summary" worksheet, pasting it into the new file, and naming each pasted worksheet after its source file (e.g. "Experiment_A"). Any advice?
I had a hard time reading your code to figure out what you ultimately want to do, so this is advice rather than a solution. You can iterate through all the files in the folder using os, read each file into a dataframe, and then save the single big dataframe. I usually avoid Excel, but I guess you need the Excel conversion. In the example below I read all the txt files from a directory, put them into a list of dataframes, and store the combined dataframe as json. You could store it as excel/csv the same way.
import os

import pandas as pd


def process_data():
    # input file path in 2 parts in case it is very long
    input_path_1 = r'\\path\to\the\folder'
    input_path_2 = r'\second\part\of\the\path'
    # assembling the full folder path
    file_path = input_path_1 + input_path_2
    # listing all files in the folder
    file_list = os.listdir(os.path.join(file_path))
    # keeping only the .txt files
    file_list = [file_name for file_name in file_list if '.txt' in file_name]
    # selecting the fields we need ('sent_date' added here so the date
    # transformation below has a column to work on)
    field_names = ['country', 'ticket_id', 'sent_date']
    # a list to collect all the dataframes
    pd_list = []
    inserted_files = []
    # looping over the txt files
    for file_name in file_list:
        # building the path of the file to read
        file_path_ = file_path + '\\' + file_name
        df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names)
        # a few internal transformations before writing:
        # truncating the datetime to the month
        df_['sent_date'] = pd.to_datetime(df_['sent_date'])
        df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
        # adding each dataframe to the list
        pd_list.append(df_)
        # remembering the file name so we can print it later
        inserted_files.append(file_name)
    print(inserted_files)
    # SQL-like UNION ALL: concatenate the dataframes into a single one
    df_ = pd.concat(pd_list)
    output_path_1 = r'\\path\to\output'
    output_path_2 = r'\path\to\output'
    output_path = output_path_1 + output_path_2
    # the output file name
    file_name = 'xyz.json'
    # stamping the rows with the day the file was processed
    df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    # writing the result to json
    df_.to_json(os.path.join(output_path, file_name), orient='records')
    print('Data stored as json successfully')


process_data()
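To address the original question more directly, here is a hedged sketch of iterating a folder of xlsm files with openpyxl, copying each file's "Summary" sheet into one output workbook, with each sheet named after its source file. It copies values only: data_only=True drops formulas, and openpyxl will not carry formatting across workbooks. The folder path is an assumption, and cell.column being an integer requires openpyxl 2.6 or newer:

import glob
import os

import openpyxl

folder = r'C:\path\to\xlsm\files'  # assumed location of the 60 files

target = openpyxl.Workbook()
target.remove(target.active)  # drop the default empty sheet

for path in glob.glob(os.path.join(folder, '*.xlsm')):
    src_ws = openpyxl.load_workbook(path, data_only=True)['Summary']
    # Name the copied sheet after the source file, e.g. "Experiment_A";
    # sheet titles are capped at 31 characters.
    title = os.path.splitext(os.path.basename(path))[0][:31]
    new_ws = target.create_sheet(title=title)
    for row in src_ws.iter_rows():
        for cell in row:
            new_ws.cell(row=cell.row, column=cell.column, value=cell.value)

target.save(os.path.join(folder, 'combined_summaries.xlsx'))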

Iteratively writing row values to Excel in Python: what's wrong with my code?

I want to write each root's title value to Excel column A. My code:
import os

from openpyxl import Workbook

path = "C:/path_to_folder"
# word = '<option value="1.2.0-b.1" key="#SSPVersion#"/>'
os.chdir(path)  # change directory to the application notes folder

titlelist = []
for root, dirs, files in os.walk(path):
    title = str(root.split("/")[-1])
    titlelist.append(title)

wb = Workbook()
ws = wb.active
r = 2
for t in titlelist:
    ws.cell(row=r, column=1).value = str(t)
    r += 1
wb.save("row_creation_loop.xlsx")
This does not work; it always shows this error:
Traceback (most recent call last):
  ...
    ws[column_cell + str(row + 2)] = str(i)
  ...
    self[key].value = value
    self._bind_value(value)
    value = self.check_string(value)
    value = unicode(value, self.encoding)
UnicodeDecodeError: 'utf8' codec can't decode byte 0x92 in position 17: invalid start byte
Just posting some thoughts here: this code (a copy of yours, minus reading in the titles) works just fine:
from openpyxl import Workbook

titlelist = ["title1"]

wb = Workbook()
ws = wb.active
for ind, t in enumerate(titlelist):
    ws.cell(row=ind + 2, column=1).value = str(t)
wb.save("row_creation_loop.xlsx")
So the issue is your titlelist, which contains characters that can't be decoded as UTF-8. We need to fix that, probably with an explicit decode/encode step.
Share that list with us.
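A hint on what the bad byte probably is: 0x92 is the right single quotation mark in Windows-1252, which suggests the folder names came from a Windows code page rather than UTF-8. On Python 2 (where openpyxl ends up calling unicode(value, 'utf-8')) you can decode the titles yourself before writing them; cp1252 is an assumption about the source encoding:

# Python 2 sketch: decode byte strings explicitly so openpyxl never
# has to guess the encoding. cp1252 maps 0x92 to a right single quote.
titlelist = ['John\x92s notes']  # example byte string containing 0x92

decoded = [t.decode('cp1252') for t in titlelist]
print(decoded)  # [u'John\u2019s notes'] -- safe to hand to openpyxl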

Odoo 8.0: cannot validate the CSV file when trying to import it

I have a problem when trying to import a .csv file. I convert images to base64 and also try to build the barcode from each file's name. The images convert to base64 successfully, but when I try to use the file name as the barcode, I always get an error like:
Unknown error during import: <class 'openerp.exceptions.ValidationError'>: ('ValidateError', u'Field(s) `ean13` failed against a constraint: You provided an invalid "EAN13 Barcode" reference. You may use the "Internal Reference" field instead.') at row 2 Resolve other errors first
And this is my code:
files = []
text = ''"
data_text3 = []
header_column2 = ["id","product_variant_ids/ean13_barcode", "product_variant_ids/ean13", "ean13", "image", "ean13_barcode", "default_code", "product_variant_ids/default_code"]
number = 1 for file in os.listdir("gmbr/"):
file_name = os.path.splitext(file)[0]
for n in str(number):
directory_file = "gmbr/"+str(file)
img = open(directory_file, 'rb').read()
img_64 = base64.encodestr
text = str(number)+","+str(name_product)+","+str(file_name)+","+str(file_name)+","+str(img_64+","+" "+","+" "+","+" ")
number += 1
data_text3.append(text)
with open('sample2.csv', 'wb') as f:
writer = csv.writer(f, delimiter='\t', dialect='excel')
writer.writerow(header_column2)
for l in data_text3:
writer.writerow(l.split(','))
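For context on the ValidationError itself: Odoo's ean13 field enforces the EAN-13 standard, which requires exactly 13 digits ending in a valid check digit, so arbitrary file names will be rejected no matter how the CSV is written. A sketch of the standard check-digit computation, in case you can derive a 12-digit base code from each file (the base value below is hypothetical):

def ean13_check_digit(first12):
    # EAN-13 checksum: weight the 12 leading digits 1,3,1,3,...
    # and round the weighted sum up to the next multiple of 10.
    total = sum(int(d) if i % 2 == 0 else 3 * int(d)
                for i, d in enumerate(first12))
    return (10 - total % 10) % 10

base = '200000000001'  # hypothetical 12-digit base code
ean13 = base + str(ean13_check_digit(base))
print(ean13)  # 13 digits with a valid check digit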

Merge files into xlsx and then reconstruct the dir

I have many files ('*.pl-pl'). My script has to find each of these files and merge them into one xlsx file using openpyxl.
Now I want to rebuild those files; I want to reconstruct the same files as the originals.
But there is a problem after writing (the content variable holds the content of one file, read from one Excel cell):
with open(path, 'w') as f:
    f.write(content.encode('utf-8'))
So now I check whether the original files are the same as the new files. The text in those files seems to be the same, but there are small differences in size. When I use the WinDiff application to compare them, it finds some lines that differ, but says they differ in blanks only.
Could you give me advice on how to rebuild those files so they are identical to the originals?
Or is this approach correct?
Note: I rebuild them to be sure the encoding etc. stays the same, because the merged Excel file will be used for translation, and the translated files then have to be rebuilt in place of the originals.
Here is the code: it walks the directory and prints all file names and contents into one temporary file. Then it creates an Excel file whose first column is the path (to be able to reconstruct the directory) and whose second column contains the content of the file, with newlines replaced by '*=*'.
import os
import re

import openpyxl
from openpyxl import Workbook


def print_to_file():
    for root, dirs, files in os.walk("OriginalDir"):
        for file in files:
            text = []
            if file.endswith(".pl-pl"):
                abs_path = os.path.join(root, file)
                with open(abs_path) as f:
                    for line in f:
                        text.append(line.strip('\n'))
                # mLib is a local helper module; '*=*' represents '\n'
                mLib.printToFile('files.mdoc', abs_path + '::' + '*=*'.join(text))


def write_it():
    file = 'files.mdoc'
    workbook = Workbook()
    worksheet = workbook.worksheets[0]
    worksheet.title = "Translate"
    i = 0
    with open(file) as f:
        for line in f:
            i += 1
            splitted = line.strip('\n').split('::')
            name = splitted[0]
            text = splitted[1].split('*=*')
            text = [x.encode('string-escape') for x in text]
            worksheet.cell('B{}'.format(i)).style.alignment.wrap_text = True
            worksheet.cell('B{}'.format(i)).value = splitted[1]
            worksheet.cell('A{}'.format(i)).value = splitted[0]
    workbook.save('wrap_text1.xlsx')


def rebuild():
    wb = openpyxl.load_workbook('wrap_text1.xlsx')
    ws = wb.worksheets[0]
    row_count = ws.get_highest_row()
    for i in xrange(1, row_count + 1):
        dir_file = ws.cell('A{}'.format(i)).value
        content = ws.cell('B{}'.format(i)).value
        remake(dir_file, content)


def remake(path, content):
    content = re.sub(r'\*=\*', '\n', content)
    result = ''
    splt = path.split('\\')
    file = splt[-1]
    for dir in splt[:-1]:
        result += dir + '/'
        if not os.path.isdir(result):
            os.mkdir(result)
    with open(path, 'w') as f:
        f.write(content.encode('utf-8'))


# print_to_file()  # print paths and contents, separated by '::', into the temp file
# write_it()       # write it into the excel file
# rebuild()        # reconstruct the directory
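On the "blanks only" differences: two usual suspects are the trailing newline (the '*=*'.join round trip drops whether each file ended with '\n') and text mode on Windows translating '\n' to '\r\n' on write. A sketch of a byte-exact round trip in binary mode, which sidesteps both (the file paths here are hypothetical):

def read_exact(path):
    # 'rb' reads the bytes exactly as stored, with no newline translation
    with open(path, 'rb') as f:
        return f.read()

def write_exact(path, data):
    # 'wb' writes them back untouched, so '\n' never becomes '\r\n'
    with open(path, 'wb') as f:
        f.write(data)

data = read_exact('OriginalDir/sample.pl-pl')   # hypothetical input file
write_exact('RebuiltDir/sample.pl-pl', data)    # byte-for-byte copy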
