Creating multiple output files using pandas in Python

The following code checks SampleData.txt and produces Result1.txt. I want to create another file, Result2.txt, from the same data that will contain only one column. I am new to pandas and can't figure out what needs to be modified to create Result2.txt.
import pandas as pd
from tabulate import tabulate

dl = []
with open('SampleData.txt', encoding='utf8', errors='ignore') as f:
    for line in f:
        parts = line.split()
        if not parts[3][:2].startswith('($'):
            parts.insert(3, '0')
        if len(parts) > 5:
            temp = ' '.join(parts[4:])
            parts = parts[:4] + [temp]
        parts[1] = int(parts[1])
        parts[2] = float(parts[2].replace(',', ''))
        parts[3] = float(parts[3].strip('($)').replace(',', ''))
        dl.append(parts)

headers = ['ID', 'TRANS', 'VALUE', 'AMOUNT', 'CODE']
df = pd.DataFrame(dl, columns=headers)
pd.set_option('colheader_justify', 'center')

df = df.groupby(['ID', 'CODE']).sum().reset_index().round(2)
df = df.sort_values('TRANS', ascending=False)
df['AMOUNT'] = '($' + df['AMOUNT'].astype(str) + ')'
df = df[headers]

print(df.head(n=40).to_string(index=False))
print()
df.to_csv("Out1.txt", sep="\t", index=None, header=None)
SampleData.txt
0xdata1 1 2,200,000 test1(test1)
0xdata2 1 9,500,000,000 ($70.30) test2(test2)
0xdata3 1 4.6 ($14.08) test3(test3)
0xdata4 1 0.24632941 test4(test4)
0xdata5 1 880,000,000 ($1.94) test5(test5)
Result1.txt #-- Fine and working
0xdata1 1 2,200,000 test1(test1)
0xdata2 1 9,500,000,000 ($70.30) test2(test2)
0xdata3 1 4.6 ($14.08) test3(test3)
0xdata4 1 0.24632941 test4(test4)
0xdata5 1 880,000,000 ($1.94) test5(test5)
Result2.txt #-- Additional output needed and what I am trying to produce
0xdata1
0xdata2
0xdata3
0xdata4
0xdata5

You can select just the column that you want to save; in your case:
df['ID'].to_csv("Out_ID.txt", sep="\t", index=False, header=False)
This should solve your problem!
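If each ID should appear only once in Result2.txt (an assumption about the desired output; after the groupby an ID can repeat across CODE values), a drop_duplicates() call keeps the file to unique values. A minimal sketch:
# sketch: write unique IDs only, assuming duplicates are unwanted in Result2.txt
df['ID'].drop_duplicates().to_csv("Result2.txt", sep="\t", index=False, header=False)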

Related

Python: How to treat CSV with text delimiter at the beginning and end of the line?

I need to read the files into a dataframe, but I didn't find a way to handle this file format:
Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements +/-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
This is the code I'm using:
import os
import glob
import pandas as pd
path = r'\\Srvflssp03\gto\Planejamento_Operacional\Forecast\2021\11 Novembro\Relatórios MIS'
file_list = glob.glob(os.path.join(path, '**/*.csv'), recursive=True)
combined_csv = pd.concat([pd.read_csv(f,quotechar = '"', sep = ',') for f in file_list ])
DataFrame = pd.DataFrame(combined_csv)
DataFrame.head(11)
Could anyone tell me what I can do to fix this?
You could read it as plain text, repair the quoted lines, write everything back to a file as text, and later read it all with pd.read_csv() without problems.
If you have already read it, then you may try to repair it in place. You can get the text from the first column Date, use io.StringIO to read it as CSV into another DataFrame, and then copy the row back to its original place.
text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements +/-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''
import pandas as pd
import io
# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)
# get string from first column (`Date`) in row `0`
line = df.iloc[0]['Date']
print(line)
# convert to new dataframe
df2 = pd.read_csv(io.StringIO(line), header=None)
print(df2)
# copy back to original dataframe
df.iloc[0] = df2.iloc[0]
print(df)
Result:
# df - before changes
Date ... Staff - Req
0 11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.0... ... NaN
[1 rows x 24 columns]
# string `line`
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00
# df2
0 1 2 3 4 ... 19 20 21 22 23
0 11/1/21 2559 AUTORIZACOES BDC 2:30 AM 1.54 ... 0.0 0.0 5.0 0.0 0.0
[1 rows x 24 columns]
# df - after changes
Date Entity ID Entity Name ... Sched Open Staff Est. Staff - Req
0 11/1/21 2559.0 AUTORIZACOES BDC ... 5.0 0.0 0.0
[1 rows x 24 columns]
For your full data you will need to use row 10:
line = df.iloc[10]['Date']
# ...
df.iloc[10] = df2.iloc[0] # `df2` uses `0`
EDIT:
Version with more rows and a for-loop:
text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements +/-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''
import pandas as pd
import io
# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)
for index, row in df.iterrows():
    # get string from first column (`Date`) in row
    line = row['Date']
    if ',' in line:
        #print(line)
        # convert to new dataframe
        temp_df = pd.read_csv(io.StringIO(line), header=None)
        #print(temp_df)
        # copy back to original dataframe
        df.iloc[index] = temp_df.iloc[0]
print(df)
And the same with apply() instead of a for-loop:
text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements +/-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''
import pandas as pd
import io
# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)
def convert(row):
    # get string from first column (`Date`) in row
    line = row['Date']
    if ',' in line:
        # convert to new dataframe
        temp_df = pd.read_csv(io.StringIO(line), header=None, names=df.columns)
        #print(temp_df)
        # copy back to original dataframe
        row = temp_df.iloc[0]
    return row
df = df.apply(convert, axis=1)
print(df)
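For completeness, here is a minimal sketch of the first approach (repairing the file as plain text before pandas reads it). It assumes the broken rows are exactly those wrapped in an outer pair of quotes, as in the sample; the file name is hypothetical.
import io
import pandas as pd

def repair_text(raw):
    # Strip the outer quote pair from wrapped rows, drop the trailing empty
    # fields, and un-double the inner quotes ("" -> ").
    fixed = []
    for line in raw.splitlines():
        if line.startswith('"'):
            line = line.rstrip(',')         # drop the trailing ,,,,, fields
            line = line.strip('"')          # drop the outer quote pair
            line = line.replace('""', '"')  # restore the inner quotes
        fixed.append(line)
    return '\n'.join(fixed)

with open('report.csv', encoding='utf8') as f:  # hypothetical file name
    repaired = repair_text(f.read())
df = pd.read_csv(io.StringIO(repaired))
print(df)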

Dummy if the same value occurs on a later date in python (pandas)

I have a df that contains, among other columns, several consecutive dates (yyyy-mm-dd) for each name on which the name occurred. I want to create a dummy variable in a new column Rep that indicates whether the same name appears again on a later date.
I thought about looping through the two columns Name and Date in such a way that for each name the most recent date gets a 0 while all others get a 1.
Additionally, I tried to use duplicated, but as there are multiple occurrences of the same Name on the same Date, this method does not produce the targeted output.
df:
Name Date
A 2006-01-01
B 2006-01-02
A 2006-01-04
A 2006-01-04
B 2006-01-08
outcome df:
Name Date Rep
A 2006-01-01 1
B 2006-01-02 1
A 2006-01-04 0
A 2006-01-04 0
B 2006-01-08 0
Code with duplicated method:
df = df.sort_values(by=["Name", "Date"])
df["Rep"] = df.duplicated(subset=["Name", "Date"], keep = "last")
Achieved outcome:
Name Date Rep
A 2006-01-01 1
B 2006-01-02 1
A 2006-01-04 1 # this should be 0!
A 2006-01-04 0
B 2006-01-08 0
As required, a sample of one of the csv files:
Name;Date;Name_Parent;Amount_Est
A;2006-01-01;3;646,200.00
B;2006-01-02;2;25,000,000.00
A;2006-01-04;3;18,759,000.00
A;2006-01-04;5;18,759,000.00
C;2006-01-04;4;18,759,000.00
B;2006-01-08;6;945,000.00
C;2006-01-09;2;945,000.00
A;2006-01-10;4;945,000.00
To create df, I used pandas.
As I have 40 individual csv files, I used a loop:
import pandas as pd
import glob2 as glob

# import and merge data
path = r'/Users/...'
all = glob.glob(path + "/*.csv")
l = []
for f in all:
    df1 = pd.read_csv(f, sep=";", index_col=None, header=0)
    df1 = df1.drop(df1.index[0])
    l.append(df1)
df = pd.concat(l, axis=0)
del f, all, df1, l, path
Thanks for your help!
Here is the code:
import pandas as pd
import glob
from shutil import copyfile
import os

def file_len(fname):
    # count the lines in a file
    with open(fname) as fp:
        for i, line in enumerate(fp):
            pass
    return i + 1

def read_nth(fname, intNth):
    # return the n-th line (1-based) of a file
    with open(fname) as fp:
        for i, line in enumerate(fp):
            if i == (intNth-1):
                return line

def showRepetitions(fname):
    # walk the file bottom-up, counting how many more times each Name
    # appears on later rows, and append that count as a new column
    temp8 = []  # rewritten lines, collected in reverse order
    temp3 = []  # Names seen so far (i.e. on later rows)
    for temp1 in range(file_len(fname), -1, -1):
        if "Name;Date;Name_Parent;Amount_Est" in read_nth(fname, temp1):
            temp8.append("Name;Date;Name_Parent;Amount_Est;Repeats_X_More_Times\n")
            break
        temp2 = read_nth(fname, temp1)
        temp8.append(temp2.strip()+";"+str(temp3.count(temp2.split(";")[0]))+"\n")
        temp3.append(temp2.split(";")[0])
    f = open(fname, "w")
    for temp9 in reversed(temp8):
        f.write(temp9)
    f.close()

path = r'C:\Users\USERname4\Desktop'
all = glob.glob(path + r"\*.csv")
l = []
for f in all:
    f2 = f[:-3]+"txt"
    copyfile(f, f2)
    showRepetitions(f2)
    df1 = pd.read_csv(f2, sep=";", index_col=None, header=0)
    os.remove(f2)
    l.append(df1)
df = pd.concat(l, axis=0)
print(df)
Solved the problem. Perhaps this will help someone in the future:
In a new df, df_max, I extracted from df all names with the most recent date, since there are no further entries after the respective most recent date (dummy = 0). Then I kept only the columns in df_max that are relevant for the merge. Next, in a new column Rep, I set each value to 0. After merging df and df_max into df_new on the columns Name and Date, all most-recent entries, regardless of how often the Name and Date combination occurs, have 0 in Rep. Lastly, I filled the NaNs in Rep with 1.
df = df.sort_values(by=["Name", "Date"])
df_max = pd.DataFrame(df.sort_values("Date").groupby("Name").last().reset_index())
df_max = df_max[["Name", "Date"]]
df_max["Rep"] = "0"
df_new = pd.merge(df, df_max, how="left", left_on=["Name", "Date"], right_on = ["Name", "Date"])
df_new = df_new.fillna(1)
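A shorter equivalent (a sketch, assuming yyyy-mm-dd date strings so they compare correctly) uses a per-name maximum instead of a merge:
import pandas as pd

# Rep is 1 wherever the row's date is not the most recent date for that Name
df["Rep"] = (df["Date"] != df.groupby("Name")["Date"].transform("max")).astype(int)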

How to create new csv from list of csv's in dataframe

So I know my code isn't that close to right, but I am trying to loop through a list of csv's, line by line, to create a new csv where each line lists all the csv's that met a condition. The first column in all csv's is "date"; I want to list the name of every csv where data["entry"] > 3 on that date, with date still being the first column.
Update: What I'm trying to do is, for each csv, make a new list of each date the condition was met, and on those days append file_name to that row/rows of the new csv.
### create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/SentdexTutorial/stock_dfs/')
### append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/SentdexTutorial/stock_dfs/'
listdrs_path = [string + x for x in listdrs]
complete_string = ' is complete'
listdrs_confirmation = [x + complete_string for x in listdrs]
#print (listdrs_path)

### start loop: for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file_path in listdrs_path:
    data = pd.read_csv(file_path, index_col=0)

    ########################################
    #### function 1
    def get_price_hist(ticker):
        # Put stock price data in dataframe
        data = pd.read_csv(file_path)
        #listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
        ##print(listdr)
        # Convert date to timestamp and make index
        data.index = data["date"].apply(lambda x: pd.Timestamp(x))
        data.drop("date", axis=1, inplace=True)
        return data

    ## create new table and append data
    data = data[data.Entry > 3]
    for date in data.date:
        new_table[date].append(file_path)
    new_table_data = data.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())], columns=['date', 'table names'])
    print(new_table_data)
I would do something like this. You need to modify the following snippet according to your needs.
import pandas as pd
from glob import glob
from collections import defaultdict

# create and save some random data
df1 = pd.DataFrame({'date':[1,2,3], 'entry':[4,3,2]})
df2 = pd.DataFrame({'date':[1,2,3], 'entry':[1,2,4]})
df3 = pd.DataFrame({'date':[1,2,3], 'entry':[3,1,5]})
df1.to_csv('table1.csv')
df2.to_csv('table2.csv')
df3.to_csv('table3.csv')

# read all the csv
tables = glob('*.csv')
new_table = defaultdict(list)

# create new table
for table in tables:
    df = pd.read_csv(table)
    df = df[df.entry > 2]
    for date in df.date:
        new_table[date].append(table)

new_table_df = pd.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())], columns=['date', 'table names'])
print (new_table_df)
date table names
0 1 table3.csv,table1.csv
1 2 table1.csv
2 3 table2.csv,table3.csv
Had some issues with the other code, here is the final solution I was able to come up with.
if 'Entry' in data:
    ## create new table and append data
    data = data[data.Entry > 3]
    if 'date' in data:
        for date in data.date:
            if date not in new_table:
                new_table[date] = []
            new_table[date].append(
                pd.DataFrame({'FileName': [file_name], 'Entry': [int(data[data.date == date].Entry)]}))
        new_table
    elif 'Date' in data:
        for date in data.Date:
            if date not in new_table:
                new_table[date] = []
            new_table[date].append(
                pd.DataFrame({'FileName': [file_name], 'Entry': [int(data[data.Date == date].Entry)]}))

# sorted(new_table, key=lambda x: x[0])

def find_max(tbl):
    new_table_data = {}
    for date in sorted(tbl.keys()):
        merged_dt = pd.concat(tbl[date])
        max_entry_v = max(list(merged_dt.Entry))
        tbl_names = list(merged_dt[merged_dt.Entry == max_entry_v].FileName)
        new_table_data[date] = tbl_names
    return new_table_data

new_table_data = find_max(tbl=new_table)
#df = pd.DataFrame(new_table, columns=['date', 'tickers'])
#df.to_csv(input_path, index=False, header=True)
# find_max(new_table)
# new_table_data = pd.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())],
#                               columns=['date', 'table names'])
print(new_table_data)
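The commented-out lines hint at the last step; a minimal sketch (output path hypothetical) that turns new_table_data back into a DataFrame and writes it out:
# new_table_data maps date -> list of file names with the max Entry
out_df = pd.DataFrame(
    [(date, ','.join(names)) for date, names in sorted(new_table_data.items())],
    columns=['date', 'table names'])
out_df.to_csv('new_table.csv', index=False)  # hypothetical output file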

Concatenating .mtx files and changing counter for cell IDs

I have several files that look like this, where the header is the count of unique values per column.
How can I read several of these files and concatenate them all into one?
When I concatenate, all the values in the middle column need to be shifted up by the total count of that column from the preceding file, so the numbering continues across files. The other two columns don't matter to me.
My try:
matrixFiles = glob.glob(filesPath + '/*matrix.mtx')
dfs = []
i = 0
for file in sorted(matrixFiles):
    matrix = pd.read_csv(file, sep=' ')
    cellNumber = matrix.columns[1]
    cellNumberInt = np.int64(cellNumber)
    if i > 0:
        matrix.iloc[:,1] = matrix.iloc[:,1] + cellNumberInt
    dfs.append(matrix)
    i = i + 1
big_file = pd.concat(dfs)
I don't know how to access the cellNumberInt from the previously iterated file to add it to the new one.
When I concat dfs, the output is not a three-column dataframe. How can I concatenate all the files onto the same columns while avoiding the headers?
1.csv:
33694,1298,2465341
33665,1299,20
33663,1299,8
2.csv:
53694,1398,3465341
33665,1399,20
33663,1399,8
3.csv:
13694,7778,3465341
44432,7780,20
33663,7780,8
import pandas as pd
import numpy as np

matrixFiles = ['1.csv', '2.csv', '3.csv']
dfs = []
matrix_list = []
# this dict stores the i number (keys) and the cellNumberInt (values)
cellNumberInt_dict = {}
i = 0
for file in sorted(matrixFiles):
    matrix = pd.read_csv(file)
    cellNumber = matrix.columns[1]
    cellNumberInt = np.int64(cellNumber)
    cellNumberInt_dict[i] = cellNumberInt
    if i > 0:
        matrix.rename(columns={str(cellNumberInt) : cellNumberInt + cellNumberInt_dict[i-1]}, inplace=True)
    dfs.append(matrix)
    if i < len(matrixFiles)-1:
        # we only want to keep the df values here; keeping the columns that don't
        # have shared names messes up the pd.concat()
        matrix_list.append(matrix.values)
    i += 1

# get the last df in the dfs list because it has the last cellNumberInt
last_df = dfs[-1]
# concat all of the values from the dfs except for the last one
arrs = np.concatenate(matrix_list)
# make a df from the numpy arrays
new_df = pd.DataFrame(arrs, columns=last_df.columns.tolist())
big_file = pd.concat([last_df, new_df])
big_file.rename(columns={big_file.columns.tolist()[1] : sum(cellNumberInt_dict.values())}, inplace=True)
print (big_file)
13694 10474 3465341
0 44432 7780 20
1 33663 7780 8
0 33665 1299 20
1 33663 1299 8
2 33665 1399 20
3 33663 1399 8
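An alternative sketch of the running-offset idea, assuming comma-separated files like the 1.csv/2.csv/3.csv samples, where the first line of each file holds the counts:
import pandas as pd

offset = 0  # running total of the middle-column count across files
frames = []
for path in ['1.csv', '2.csv', '3.csv']:
    with open(path) as f:
        counts = f.readline().split(',')             # header row holds the counts
    df = pd.read_csv(path, header=None, skiprows=1)  # data rows only, no header
    df[1] = df[1] + offset                           # shift the middle column
    offset += int(counts[1])                         # grow the offset for the next file
    frames.append(df)
big_file = pd.concat(frames, ignore_index=True)
print(big_file)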

Python - How to improve the dataframe performance?

There are 2 CSV files. Each file has 700,000 rows.
I have to read one file line by line and find the matching row in the other file.
Then I combine the two files' data into one file.
But it takes about 1 minute per 1,000 rows!
I don't know how to improve the performance.
Here is my code :
import pandas as pd

fail_count = 0
match_count = 0
count = 0

file1_df = pd.read_csv("Data1.csv", sep='\t')
file2_df = pd.read_csv("Data2.csv", sep='\t')

columns = ['Name', 'Age', 'Value_file1', 'Value_file2']
result_df = pd.DataFrame(columns=columns)

for row in file1_df.iterrows():
    name = row[1][2]
    age = row[1][3]
    selected = file2_df[(file2_df['Name'] == name) & (file2_df['Age'] == age)]
    if selected.empty:
        fail_count += 1
        continue
    value_file1 = row[1][4]
    value_file2 = selected['Value'].values[0]
    result_df.loc[len(result_df)] = [name, age, value_file1, value_file2]
    match_count += 1

print('match : ' + str(match_count))
print('fail : ' + str(fail_count))
result_df.to_csv('result.csv', index=False, encoding='utf-8')
Which line can be changed?
Is there any other way to do this process?
This might be too simplistic, but have you tried using the pandas.merge() functionality?
See here for syntax.
For your tables:
result_df = pd.merge(left=file1_df, right=file2_df, on=['Name', 'Age'], how='inner')
That will do an "inner" join, only keeping rows with Names & Ages that match in both tables.
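Filling in the surrounding steps, a minimal sketch (assuming both files have a Value column, as in the loop above); overlapping column names get suffixed by merge, so naming the suffixes reproduces the Value_file1/Value_file2 layout:
import pandas as pd

file1_df = pd.read_csv("Data1.csv", sep='\t')
file2_df = pd.read_csv("Data2.csv", sep='\t')

# one vectorized join instead of 700,000 scans of file2_df
result_df = pd.merge(file1_df, file2_df, on=['Name', 'Age'],
                     how='inner', suffixes=('_file1', '_file2'))
result_df.to_csv('result.csv', index=False, encoding='utf-8')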
