'utf-8' codec can't decode byte 0x8b - python

There are several folders (named DT_20180102, DT_20180103, ...) in the ComputedTEsCsv folder. Each DT_... folder contains 498 CSV files. I want to load these into a dictionary and save that dictionary as a pickle.
I wrote the code below, but it raises this error:
'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
How can I correct this?
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory containing Joined Datasets of all companies.
_dir = "/Users/admin/Desktop/TransferEntropyEarningsAnnouncements/SP500Data/ComputedTEsCsv/"
# Create directory names
dates = [i for i in os.listdir(_dir) if 'DT' in i]  # e.g. ['DT_20180201', 'DT_20180202']
# Create/populate dictionary to contain all network data
network_dfs = {}
for _date in dates:
    network_dfs[_date] = {}
load_pickle = False  # Reading in the data is costly. Set to True to read from the pickle file instead.
p_path = "SP500Data/NetworkJoinAll.pickle"  # Save all files here ...
for date in tqdm(dates, total=len(dates), desc='JoiningAllNetworkDates'):
    try:
        base_path = "{0}{1}/".format(_dir, date)
        company_files = os.listdir(base_path)
        if '.ipynb_checkpoints' in company_files:
            company_files.remove('.ipynb_checkpoints')
        if '.rda' in company_files:
            company_files.remove('.rda')
        for i, company_file in enumerate(company_files):
            # Only read in the 1st 34 columns with 2 hr 10 min periods
            tmp_df = pd.read_csv(base_path + company_file)
            if i == 0:
                network_dfs[date] = tmp_df
            else:
                network_dfs[date] = pd.concat([network_dfs[date], tmp_df], ignore_index=True)
        # Clean data: set any negative TE values to NaN.
        for col in network_dfs[date].columns[3:]:
            network_dfs[date].loc[network_dfs[date][col] < 0, col] = np.nan
    except FileNotFoundError:
        pass
print('Writing network data to {0}'.format(p_path))
with open(p_path, 'wb') as f:
    pickle.dump(network_dfs, f, pickle.HIGHEST_PROTOCOL)
print('Done.')
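For what it's worth, byte 0x8b in position 1 is characteristic of a gzip stream (the magic bytes are 0x1f 0x8b), so the most likely cause is that at least one file in those folders is gzip-compressed rather than plain text. Below is a minimal sketch of a defensive read; the gzip fallback is an assumption about the data, not something the question confirms.
import pandas as pd

def read_maybe_gzipped_csv(path):
    # Peek at the first two bytes; 0x1f 0x8b is the gzip signature.
    with open(path, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':
        return pd.read_csv(path, compression='gzip')
    return pd.read_csv(path)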

Related

UnicodeDecodeError upon reading xls files

I'm trying to read in and extract information from many Excel files in XLS format using Python. When I run my code, I encounter the following warnings and error:
WARNING *** file size (89002) not 512 + multiple of sector size (512)
WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x20 in position 108: truncated data
What's funny is that once I open the file manually and then run the code, the code will execute just fine.
Since there are about 500 files in the folder, I'd like to find out the cause of the error so that I can automate the process without having to open every single file. Any help would be appreciated!
(Below is an example of the type of xls file)
https://www.dropbox.com/s/w2r8br0nblbbr0x/A1-1a105800.XLS?dl=1
import glob
import os

import xlrd

data_year = 2007
path = 'C:/Users/hard1/Desktop/CRA/' + str(data_year)
filenames = []
for filename in glob.glob(os.path.join(path, '*.xls')):
    filenames.append(filename)

respondent_id = []
bank_name = []
loan_amount = []
state = []
year = []

for filename in filenames:
    print(filename)
    # wb = xlrd.open_workbook(filename, encoding_override="utf_16_le")
    wb = xlrd.open_workbook(filename)
    sheet = wb.sheet_by_index(0)
    # Column M is index 12; the last two characters are the state abbreviation
    msa_string = sheet.cell(2, 12).value
    state_string = msa_string[len(msa_string)-2 : len(msa_string)]
    col_id = sheet.col_values(5)
    col_bank = sheet.col_values(0)
    col_loan = sheet.col_values(23)
    ### And then code that extracts information from the files follows
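One hedged workaround, hinted at by the commented-out encoding_override line above: retry any workbook that fails to decode with an explicit codepage. This is only a sketch, and "cp1252" is an assumption; substitute whatever codepage the files were actually written with.
import xlrd

def open_workbook_tolerant(filename):
    # Try the normal path first; on a decode error, retry with an explicit override.
    try:
        return xlrd.open_workbook(filename)
    except UnicodeDecodeError:
        return xlrd.open_workbook(filename, encoding_override="cp1252")  # codepage is an assumption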

Problems opening DBF files in python

I am trying to open and transform several DBF files into a dataframe. Most of them work fine, but for one of the files I receive the error:
"UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 15: invalid start byte"
I have seen this error in other topics, such as ones about opening CSV, XLSX and other files, where the proposed solution was to include encoding='utf-8' in the part that reads the file. Unfortunately I haven't found a solution for DBF files, and I have very limited knowledge of DBF files.
What I have tried so far:
1)
from dbfread import DBF
dbf = DBF('file.DBF')
dbf = pd.DataFrame(dbf)
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 8: character maps to <undefined>
2)
from simpledbf import Dbf5
dbf = Dbf5('file.DBF')
dbf = dbf.to_dataframe()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 15: invalid start byte
3)
# this block of code copied from https://gist.github.com/ryan-hill/f90b1c68f60d12baea81
import pysal as ps

def dbf2DF(dbfile, upper=True):  # Reads in DBF files and returns a pandas DF
    db = ps.open(dbfile)  # Pysal to open DBF
    d = {col: db.by_col(col) for col in db.header}  # Convert dbf to dictionary
    pandasDF = pd.DataFrame(d)  # Convert to pandas DF
    if upper == True:  # Make columns uppercase if wanted
        pandasDF.columns = map(str.upper, db.header)
    db.close()
    return pandasDF

dfb = dbf2DF('file.DBF')
AttributeError: module 'pysal' has no attribute 'open'
And last, if I try to install the dbfpy module, I receive:
SyntaxError: invalid syntax
Any suggestions on how to solve this?
Try using my dbf library:
import dbf
table = dbf.Table('file.DBF')
Print it to see if an encoding is present in the file:
print table # print(table) in Python 3
One of my test tables looks like this:
Table: tempy.dbf
Type: dBase III Plus
Codepage: ascii (plain ol ascii)
Status: DbfStatus.CLOSED
Last updated: 2019-07-26
Record count: 1
Field count: 2
Record length: 31
--Fields--
0) name C(20)
1) desc M
The important line is the Codepage line -- it sounds like that is not set properly for your DBF file. If you know what it should be, you can either open the table with that codepage (temporarily) with:
table = dbf.Table('file.DBF', codepage='...')
Or you can change it permanently (updates the DBF file) with:
table.open()
table.codepage = dbf.CodePage('cp1252') # for example
table.close()
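As a rough illustration of the workflow above, loading such a table into pandas might look like the sketch below. The 'cp1252' codepage is an assumption, and READ_ONLY / field_names are the dbf package attributes I believe apply here; check them against your installed version.
import dbf
import pandas as pd

table = dbf.Table('file.DBF', codepage='cp1252')  # codepage is an assumption
table.open(mode=dbf.READ_ONLY)
df = pd.DataFrame([list(record) for record in table], columns=table.field_names)
table.close()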
from simpledbf import Dbf5
dbf2 = Dbf5('/Users/.../TCAT_MUNICIPIOS.dbf', codec='latin')
df2 = dbf2.to_dataframe()
df2.head(3)
Install the dbfread library (e.g. pip install dbfread), then:
from dbfread import DBF
import pandas as pd

db_in_dbf = DBF('path/database.dbf')   # this line loads the database
df = pd.DataFrame(db_in_dbf)           # this line converts it to a pandas dataframe
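If the decode error persists, dbfread's DBF also accepts an encoding argument; 'cp1252' below is only a guess at the file's actual codepage.
from dbfread import DBF
import pandas as pd

db_in_dbf = DBF('path/database.dbf', encoding='cp1252')  # encoding is a guess
df = pd.DataFrame(iter(db_in_dbf))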
For anyone else who lands on this issue: I had to fix a corrupt .dbf file (so the data came from a .dbf and had to be returned to a .dbf). My particular issue was that dates throughout the .dbf were just very wrong, and I tried and failed via many methods, with many errors, to crack and reassemble it before succeeding with the below:
# Modify dbase3 file to recast null date fields as a default date and
# reimport back into the dbase3 file
import collections
import datetime

import dbf as dbf1
import numpy as np
import pandas as pd
from dbfread import DBF, FieldParser
from simpledbf import Dbf5

# Default date to overwrite NaN values
blank_date = datetime.date(1900, 1, 1)

# Read in dbase file from the old path and point to the new path
old_path = r"C:\...\ex.dbf"
new_path = r"C:\...\newex.dbf"

# Establish 1st rule for resolving corrupted dates
class MyFieldParser(FieldParser):
    def parse(self, field, data):
        try:
            return FieldParser.parse(self, field, data)
        except ValueError:
            return blank_date

# Collect the original .DBF data while stepping over any errors
table = DBF(old_path, None, True, False, MyFieldParser, collections.OrderedDict, False, False, False, 'ignore')

# Grab the header name, old-school variable format, and number of characters/length for each variable
dbfh = Dbf5(old_path, codec='utf-8')
headers = dbfh.fields
hdct = {x[0]: x[1:] for x in headers}
hdct.pop('DeletionFlag')
keys = hdct.keys()

# Position of type and length relative to the field name
ftype = 0
characters = 1

# Reformat and join all old-school DBF header fields in the required format
fields = list()
for key in keys:
    ftemp = hdct.get(key)
    k1 = str(key)
    res1 = ftemp[ftype]
    res2 = ftemp[characters]
    if k1 == "decimal_field_name":
        fields.append(k1 + " " + res1 + "(" + str(res2) + ",2)")
    elif res1 == 'N':
        fields.append(k1 + " " + res1 + "(" + str(res2) + ",0)")
    elif res1 == 'D':
        fields.append(k1 + " " + res1)
    elif res1 == 'L':
        fields.append(k1 + " " + res1)
    else:
        fields.append(k1 + " " + res1 + "(" + str(res2) + ")")
addfields = '; '.join(str(f) for f in fields)

# Load the records of the .dbf into a dataframe
df = pd.DataFrame(iter(table))

# Reformat date fields to ensure they are in the correct format
df['DATE_FIELD1'] = df['DATE_FIELD1'].replace(np.nan, blank_date)
df['DATE_FIELD1'] = pd.to_datetime(df['DATE_FIELD1'])

# Eliminate further errors in the dataframe
df = df.fillna('0')

# Drop the added "record index" field from the dataframe
df.set_index('existing_primary_key', inplace=False)

# Initialize a defaultdict and convert the dataframe into a .DBF-appendable format
dd = collections.defaultdict(list)
records = df.to_dict('records', into=dd)

# Create the new .DBF file
new_table = dbf1.Table(new_path, addfields)

# Append the dataframe to the new .DBF file
new_table.open(mode=dbf1.READ_WRITE)
for record in records:
    new_table.append(record)
new_table.close()

Pandas Generate Multiple xlsx file from CSV

I'm trying to generate multiple Excel files from a single CSV file, but after generating a few files I get the error below:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 8: ordinal not in range(128)
The error appears only after a few files have been generated; I'm not sure whether it is specific to a file or an issue in the code. Kindly help.
The code is as below:
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import pandas.io.formats.excel

pandas.io.formats.excel.header_style = None

class AdvertiserList(object):
    def __init__(self, input_file):
        self.input_file = input_file
        self.file_csv = None
        self.writer = None
        self.path = None

    def read_csv(self):
        file_csv = pd.read_csv(self.input_file)
        file_csv_br = file_csv[file_csv['Market'] == 'BR']
        file_csv = file_csv.drop(file_csv_br.index, axis=0)
        self.file_csv = file_csv

    def generate_multiple_file(self):
        df_by_market = self.file_csv.groupby('Market')
        self.path = "C://Adops-Git//Files//"
        for (market, market_df) in df_by_market:
            self.writer = pd.ExcelWriter(self.path + "{}.xlsx".format(market), engine="xlsxwriter")
            market_df.to_excel(self.writer, index=False)
            self.writer.save()
            self.writer.close()

    def main(self):
        self.read_csv()
        self.generate_multiple_file()

if __name__ == "__main__":
    object_advertiser = AdvertiserList('C://Adops-Git//Files//Account_&_Advertisers_List_data.csv')
    object_advertiser.main()
Just try
market_df.to_excel(self.path + "{}.xlsx".format(market), index=False)
directly and use the default xlsx writer, which supports Unicode natively.
Also, judging from the comments, you may be using an outdated version of pandas if you don't have the encoding argument option.
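A small sketch of what that change could look like in context; the paths and the 'Market' column follow the question, and the explicit encoding passed to read_csv is an assumption about the source file.
import pandas as pd

path = "C://Adops-Git//Files//"
file_csv = pd.read_csv(path + "Account_&_Advertisers_List_data.csv", encoding="utf-8")
file_csv = file_csv[file_csv['Market'] != 'BR']  # drop the BR market as in the original code

for market, market_df in file_csv.groupby('Market'):
    # No explicit ExcelWriter: to_excel creates and closes the file itself.
    market_df.to_excel(path + "{}.xlsx".format(market), index=False)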

Loading a json file in python

I've got multiple files to load as JSON. They are all formatted the same way, but for one of them I can't load it without raising an exception. This is where you can find the file:
File
I wrote the following code:
def from_seed_data_extract_summoners():
    summonerIds = set()
    for i in range(1, 11):
        file_name = 'data/matches%s.json' % i
        print file_name
        with open(file_name) as data_file:
            data = json.load(data_file)
        for match in data['matches']:
            for summoner in match['participantIdentities']:
                summonerIds.add(summoner['player']['summonerId'])
    return summonerIds
The error occurs when I call json.load(data_file). I suppose there is a special character, but I can't find it and don't know how to replace it. The error generated is:
UnicodeDecodeError: 'utf8' codec can't decode byte 0xeb in position 6: invalid continuation byte
Do you know how I can get rid of it?
Your JSON is trying to force the data into unicode, not just a simple string. You've got some embedded character (probably a space or something not very noticeable) that cannot be forced into unicode.
How to get string objects instead of Unicode ones from JSON in Python?
That is a great thread about making JSON objects more manageable in python.
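For reference, a minimal Python 2 sketch of the conversion that thread describes: walk the decoded object and turn unicode strings back into UTF-8 byte strings. Whether this helps with the decode error depends on the file itself, so treat it as illustrative only.
import json

def byteify(obj):
    # Recursively convert unicode strings back to UTF-8 byte strings.
    if isinstance(obj, dict):
        return {byteify(k): byteify(v) for k, v in obj.iteritems()}
    if isinstance(obj, list):
        return [byteify(item) for item in obj]
    if isinstance(obj, unicode):
        return obj.encode('utf-8')
    return obj

with open('data/matches1.json') as data_file:
    data = byteify(json.load(data_file))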
replace file_name = 'data/matches%s.json' % i with file_name = 'data/matches%i.json' % i
the right syntax is data = json.load(file_name) and not -
with open(file_name) as data_file:
data = json.load(data_file)
EDIT:
def from_seed_data_extract_summoners():
    summonerIds = set()
    for i in range(1, 11):
        file_name = 'data/matches%i.json' % i
        with open(file_name) as f:
            data = json.load(f, encoding='utf-8')
        for match in data['matches']:
            for summoner in match['participantIdentities']:
                summonerIds.add(summoner['player']['summonerId'])
    return summonerIds
Try:
json.loads(unicode(data_file.read(), errors='ignore'))
or:
json.loads(unidecode.unidecode(unicode(data_file.read(), errors='ignore')))
(for the second, you would need to install unidecode)
Try:
json.loads(data_file.read(), encoding='utf-8')

Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte

I have a problem converting nested JSON to CSV. For this I use https://github.com/vinay20045/json-to-csv (forked a bit to support Python 3.4); here is the full json-to-csv.py file.
Converting works if I set
#Base Condition
else:
    reduced_item[str(key)] = (str(value)).encode('utf8', 'ignore')
and
fp = open(json_file_path, 'r', encoding='utf-8')
but when I import the CSV into MS Excel I see bad Cyrillic characters, for example \xe0\xf1; English text is OK.
I experimented with setting encode('cp1251','ignore'), but then I got an error:
UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>
import sys
import json
import csv

##
# This function converts an item like
# {
#   "item_1":"value_11",
#   "item_2":"value_12",
#   "item_3":"value_13",
#   "item_4":["sub_value_14", "sub_value_15"],
#   "item_5":{
#       "sub_item_1":"sub_item_value_11",
#       "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
#   }
# }
# To
# {
#   "node_item_1":"value_11",
#   "node_item_2":"value_12",
#   "node_item_3":"value_13",
#   "node_item_4_0":"sub_value_14",
#   "node_item_4_1":"sub_value_15",
#   "node_item_5_sub_item_1":"sub_item_value_11",
#   "node_item_5_sub_item_2_0":"sub_item_value_12",
#   "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
    global reduced_item

    # Reduction Condition 1
    if type(value) is list:
        i = 0
        for sub_item in value:
            reduce_item(key + '_' + str(i), sub_item)
            i = i + 1
    # Reduction Condition 2
    elif type(value) is dict:
        sub_keys = value.keys()
        for sub_key in sub_keys:
            reduce_item(key + '_' + str(sub_key), value[sub_key])
    # Base Condition
    else:
        reduced_item[str(key)] = (str(value)).encode('cp1251', 'ignore')

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
    else:
        # Reading arguments
        node = sys.argv[1]
        json_file_path = sys.argv[2]
        csv_file_path = sys.argv[3]

        fp = open(json_file_path, 'r', encoding='cp1251')
        json_value = fp.read()
        raw_data = json.loads(json_value)

        processed_data = []
        header = []
        for item in raw_data[node]:
            reduced_item = {}
            reduce_item(node, item)
            header += reduced_item.keys()
            processed_data.append(reduced_item)

        header = list(set(header))
        header.sort()

        with open(csv_file_path, 'wt+') as f:  # wb+ for python 2.7
            writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
            writer.writeheader()
            for row in processed_data:
                writer.writerow(row)

        print("Just completed writing csv file with %d columns" % len(header))
How do I convert Cyrillic correctly, and how can I also skip bad characters?
You need to know which Cyrillic encoding the file you are going to open uses.
For example, this is enough in Python 3:
with open(args.input_file, 'r', encoding="cp866") as input_file:
    data = input_file.read()
structure = json.loads(data)
In Python 3 the data variable is automatically utf-8. In Python 2 there might be a problem with feeding the input to json.
Also try printing a line in the Python interpreter to see if the symbols are right. Without the input file it is hard to tell whether everything is right. Also, are you sure it is a Python problem and not an Excel-related one? Did you try opening the file in Notepad++ or a similar encoding-respecting editor?
The most important thing when working with encodings is checking that the input and output are right. I would suggest looking here.
Maybe you could use chardet to detect the file's encoding.
import json

import chardet

File = 'arq.GeoJson'
enc = chardet.detect(open(File, 'rb').read())['encoding']
with open(File, 'r', encoding=enc) as f:
    data = json.load(f)
This avoids having to guess the encoding.
