Cannot convert <function array_str at 0x02F1E978> to Excel - python

I keep getting the error above even though I tried to convert everything to object or string.
df['temp'] = df['Date'].apply(lambda x: x.strftime('%m/%d/%Y'))
nd = df['Date'].unique() nd = np.array_str
I want to get the unique value in column Date of df to be the column header. I want this value to show as MM/DD/YYYY. The result in Python appears as "0x02F1E978". It should have been 09/25/2017 and I can write the file to Excel at all.
import pandas as pd
import numpy as np
from datetime import date, datetime
path = 'C:/Users/tnguy075/Desktop/Inventory Valuation/'
file1 = 'AH_INDY_COMBINEDINV_VALUE_TIDL.xlsx'
file2 = 'DailyInventoryVal.xlsx'
df = pd.read_excel(path+file1, skiprows=1, dtype={'Valuation': np.float64}, parse_dates=['Date']) #open the daily data
df['temp'] = df['Date'].apply(lambda x: x.strftime('%m/%d/%Y'))#change it to a string in form of MM/DD/YYYY
nd = df['Date'].unique()
nd = np.array_str
df = pd.pivot_table(df,index=["Key"], values=["Valuation"],aggfunc=np.sum).reset_index()
df.columns = ['Key',nd]
dv = pd.read_excel(path+file2)
dv = dv.merge(df[['Key', nd]], how = 'left')#Merge data from file 1 using Key
dv.to_excel('path+DaivalReport.xlsx')

Related

Read json with meaningless keys into pandas data.frame with correct Dtype

In a project, I receive json that I need to read into a pandas data.frame.
The format looks like the one below (with more columns and rows):
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
I'm able to split the strings, but my types are not what I want. Is there an automated way to convert the columns in u?
from io import StringIO
import pandas as pd
TESTDATA = StringIO("""
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
""")
df = pd.read_json(TESTDATA)
df.head(10)
vnames = df.columns[0].split(';')
u = (df[df.columns[0]].str.split(';', expand=True)
.set_axis(vnames, axis=1, inplace=False)).convert_dtypes()
print(u.head(10))
print(u.info())
I want the Dtype to be int64, datetime64, float64, str.
You could do the following:
from io import StringIO
import pandas as pd
import numpy as np
TESTDATA = StringIO("""
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
""")
df = pd.read_json(TESTDATA)
df.head(10)
vnames = df.columns[0].split(';')
u = (df[df.columns[0]].str.split(';', expand=True)
.set_axis(vnames, axis=1, inplace=False))
u = u.apply(lambda x: x.str.strip()).replace('NA', np.nan)
u = u.to_json()
u = pd.read_json(u).convert_dtypes()
print(u.head(10))
print(u.info())
Try explicitly typecasting the string values before creating the DataFrame, like in this example:
import json
import pandas as pd
s_src = '''{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}'''
s = json.loads(s_src)
# per-column type conversion
typeconv = [int, pd.to_datetime, float, str]
for k1, subd in s.items():
cols = k1.split(';')
rows = []
for k, v in subd.items():
row = v.split(';')
conv_row =[]
for cvt, r in zip(typeconv, row):
# screen for missing values
if r == 'NA':
conv_row.append(None)
else:
# apply the conversion function for this column
conv_row.append(cvt(r))
rows.append(conv_row)
df = pd.DataFrame(rows, columns=cols)

Formatting of JSON file

Can we convert the highlighted INTEGER values to STRING value (refer below link)?
https://i.stack.imgur.com/3JbLQ.png
CODE
filename = "newsample2.csv"
jsonFileName = "myjson2.json"
import pandas as pd
df = pd.read_csv ('newsample2.csv')
df.to_json('myjson2.json', indent=4)
print(df)
Try doing something like this.
import pandas as pd
filename = "newsample2.csv"
jsonFileName = "myjson2.json"
df = pd.read_csv ('newsample2.csv')
df['index'] = df.index
df.to_json('myjson2.json', indent=4)
print(df)
This will take indices of your data and store them in the index column, so they will become a part of your data.

Python: how do I get the date from this date range 08/03/2020 to 08/03/2020 (excel equivalent of left( 08/03/2020 to 08/03/2020, 10))

I have over 100 CSV files, whose date is saved in the following format: 08/03/2020 to 08/03/2020
I have the following code, but before append (or after) option I would like to fix date format (I only need first part), delete duplicate dates and sort dates chronologically. This is excel equivalent: left(cell,10)
import pandas as pd
import glob
import os
path = 'C:\\Users\\test\\Desktop\\AA2020\\files\\'
# Change directory
os.chdir(path)
# Define dataframe that will append the data to
df = pd.DataFrame()
for file in glob.glob('*.csv'):
df_tmp = pd.read_csv(path + file, skiprows=range(1, 10))
df_tmp.columns = ['value']
date = df_tmp.loc['Date', 'value']
count = df_tmp.loc['Results Found', 'value']
df = df.append(pd.DataFrame([[date, count]], columns=['date', 'count']))
df.to_excel(path + "results\\count.xlsx")

Pandas DataFrame - KeyError: 'date'

For a current project, I am working with a large Pandas DataFrame sourced from a JSON file.
As soon as calling specific objects of the JSON file within Pandas, I am getting key errors such as KeyError: 'date' for line df['date'] = pd.to_datetime(df['date']).
I have already excluded the identifier/object wording as a possible source for the error. Is there any smart tweak to make this code work?
The JSON file has the following structure:
[
{"stock_symbol": "AMG", "date": "2013-01-01", "txt_main": "ABC"}
]
And the corresponding code section looks like this:
import string
import json
import pandas as pd
# Loading and normalising the input file
file = open("sp500.json", "r")
data = json.load(file)
df = pd.json_normalize(data)
df = pd.DataFrame().fillna("")
# Datetime conversion
df['date'] = pd.to_datetime(df['date'])
Take a look at the documentation examples of fillna function fillna function.
By doing df = pd.DataFrame().fillna("") you are overriding your previous df with a new (empty) dataframe. You can just apply it this way: df = df.fillna("").
In [42]: import string
...: import json
...: import pandas as pd
...:
...: # Loading and normalising the input file
...: #file = open("sp500.json", "r")
...: #data = json.load(file)
...: df = pd.json_normalize(a)
...: #df = pd.DataFrame().fillna("")
...:
...: # Datetime conversion
...: df['date'] = pd.to_datetime(df['date'])
In [43]: df
Out[43]:
stock_symbol date txt_main
0 AMG 2013-01-01 ABC
df = pd.DataFrame().fillna("") creates a new empty dataframe and fills "NaN" with empty.
So, change that line to df = df.fillna("")
You are using df = pd.DataFrame().fillna("") which will create a new dataframe and fill an with no value.
Here the old df is replaced by empty dataframe, so there is no column named date. Instead, you can use to fill 'na' values using df.fillna("").
import string
import json
import pandas as pd
# Loading and normalising the input file
file = open("sp500.json", "r")
data = json.load(file)
df = pd.json_normalize(data)
df = df.fillna("")
# Datetime conversion
df['date'] = pd.to_datetime(df['date'])
Thank you

Pandas reading CSVs and filtering dataframe based on filedate

I'm trying to read a bunch of CSV-files into a single pandas dataframe. Some of the CSVs have data for multiple dates. I want only the data from each CSV that has a date equal to the modification date of each file.
Here is my current attempt:
import os
import datetime
import pandas as pd
from pandas import Series, DataFrame
import glob as glob
path =r'C:xxx'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
def modification_date(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t).strftime('%Y-%m-%d')
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
df["DATE"] = pd.to_datetime(df["DATE"], format='%Y-%m-%d')
filedate = modification_date(allFiles)
df = df[(df["DATE"] == filedate)]
list_.append(df)
frame = pd.concat(list_)
frame.reset_index(inplace=True, drop=True)
This fails because the loop here creates a list of modification dates (since the folder contains many CSV's) that the function modification_date can't handle. Error is: "TypeError: coercing to Unicode: need string or buffer, list found"
I'm trying to wrap my head around how to modify this so each CSV is evaluated separately but can't seem to get far.
I would do it this way:
import os
import glob
import pandas as pd
fmask = 'C:/Temp/.data/aaa/*.csv'
all_files = glob.glob(fmask)
# returns file's modification date (the time part will be truncated)
def get_mdate(filename):
return (pd.to_datetime(os.path.getmtime(filename), unit='s')
.replace(hour=0, minute=0, second=0, microsecond=0))
df = pd.concat([pd.read_csv(f, parse_dates=['DATE'])
.query('DATE == #get_mdate(#f)')
for f in all_files
],
ignore_index=True)
Test:
1.csv: # modification date: 2016-07-07
DATE,VAL
2016-07-06,10
2016-07-06,10
2016-07-05,10
2016-07-07,110
2016-07-07,110
2.csv: # modification date: 2016-07-05
DATE,VAL
2016-07-06,1
2016-07-06,1
2016-07-05,1
2016-07-07,11
2016-07-07,11
Result:
In [208]: %paste
df = pd.concat([pd.read_csv(f, parse_dates=['DATE'])
.query('DATE == #get_mdate(#f)')
for f in all_files
],
ignore_index=True)
## -- End pasted text --
In [209]: df
Out[209]:
DATE VAL
0 2016-07-07 110
1 2016-07-07 110
2 2016-07-05 1

Categories