converting the contents of txt file to columns of pandas dataframe - python

I have a .txt file of this sort
12
21
23
1
23
42
12
0
In which <12,21,23> are features and <1> is a label.
Again <23,42,12> are features and <0> is the label and so on.
I want to create a pandas dataframe with multiple columns from the above text file, which contains only a single column of values.
The format of the dataframe is {column1, column2, column3, column4}, and there are no column names in it.
Can someone please help me out in this?
Thanks
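
A minimal sketch of one direct approach, assuming the file really does hold one integer per line and the total number of lines is a multiple of 4 (the file name data.txt is just a placeholder):

import pandas as pd

# Read the single-column file, then fold every 4 consecutive values into one row:
# the first three columns are the features and the last one is the label.
values = pd.read_csv('data.txt', header=None)[0].to_numpy()
df = pd.DataFrame(values.reshape(-1, 4))
print(df)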

import pandas as pd

df = dict()
features = list()
label = ''
filename = '.txt'
with open(filename) as fd:
    i = 0
    for line in fd:
        if i != 3:
            features.append(line.strip())
            i += 1
        else:
            label = line.strip()
            i = 0
            # each label becomes a column name, so repeated labels overwrite earlier feature groups
            df[label] = features
            features = list()
df = pd.DataFrame(df)
df

import pandas as pd

with open(<FILEPATH>, "r") as f:
    lines = f.readlines()
formatted = [int(line.strip()) for line in lines]  # Remove whitespace/newlines and convert to int
labels = formatted[3::4]
features = list(zip(formatted[::4], formatted[1::4], formatted[2::4]))  # You can modify this if there are more than three features per record
data = {}
for i, label in enumerate(labels):
    data[label] = list(features[i])
df = pd.DataFrame(data)
Comment if you have any questions or find any errors, and I will make amendments.

You can use numpy. First, you need to ensure that the number of values is a multiple of 4.
Each record as a column, with the label as the header:
import numpy as np
import pandas as pd

a = np.loadtxt('file.txt').reshape((4, -1), order='F')
df = pd.DataFrame(a[:-1], columns=a[-1])
Output:
    1.0   0.0
0  12.0  23.0
1  21.0  42.0
2  23.0  12.0
Each record as a new row:
a = np.loadtxt('file.txt').reshape((-1,4))
df = pd.DataFrame(a)
Output:
      0     1     2    3
0  12.0  21.0  23.0  1.0
1  23.0  42.0  12.0  0.0

import pandas as pd

row = []
i = 0
data = []
with open('a.txt') as f:
    for line in f:
        i += 1
        row.append(int(line.strip()))
        if i % 4 == 0:
            data.append(row)
            row = []
df = pd.DataFrame(data)
which results in df being:
    0   1   2  3
0  12  21  23  1
1  23  42  12  0

Related

Pandas returning empty dataframe after merge

I'm attempting to merge multiple sets of word data. Each csv file that is read in (there are 4 files) contains a column of the unique words in a book and a column for how many times each word shows up. What's supposed to happen is that the Word columns of all of these csv files merge into one in the new matrix file I'm trying to create, but when I attempt to merge each csv file and its data, an empty dataframe is returned.
The csv files are like:
Word Count
Thou 100
O 20
Hither 8
and I want them to merge like this:
Word Book1 Book2 Book3
Thou 50 0 88
Hello 32 35 27
No 89 38 0
Yes 80 99 0
import os
from os import listdir
from os.path import isfile, join
import pandas as pd

dataPath = 'data/'
fileNames = [f for f in listdir(dataPath) if isfile(join(dataPath, f))]
columns = [os.path.splitext(x)[0] for x in fileNames]
columns.remove('rows')
columns.remove('cols')
columns.remove('matrix')
columns.insert(0, "Word")
wordData = []
matrix = pd.DataFrame(columns=columns)
for file in fileNames:
    if '.txt' in file:
        continue
    elif 'matrix' in file:
        continue
    else:
        myFile = open(f"./data/{file}", "r")
        readFile = myFile.read()
        dataVector = pd.read_csv(f"./data/{file}", sep=",")
        #print(dataVector)
        matrix.merge(dataVector, how="outer", on=["Word"])
        print(matrix)
        myFile.close()
pd.set_option("display.max_rows", None, "display.max_columns", None)
matrix = matrix.fillna(0)
matrix.to_csv(path_or_buf="./data/matrix.csv")
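A likely cause of the empty result is that DataFrame.merge returns a new DataFrame rather than modifying matrix in place, so the merged data is discarded on every loop iteration. A minimal sketch of the assign-back pattern, with toy frames standing in for matrix and dataVector:

import pandas as pd

# Toy stand-ins for `matrix` and one `dataVector` from the code above
matrix = pd.DataFrame({'Word': ['Thou', 'O'], 'Book1': [100, 20]})
dataVector = pd.DataFrame({'Word': ['Thou', 'Hither'], 'Count': [50, 8]})

# merge() returns a new DataFrame; assign it back, otherwise `matrix` keeps its old contents
matrix = matrix.merge(dataVector, how='outer', on=['Word'])
print(matrix)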
I think this may be what you need.
Data:
import pandas as pd
book_list = []
book_list.append(pd.DataFrame({'Word': ['a', 'b'], 'Count': [1, 2]}))
book_list.append(pd.DataFrame({'Word': ['b', 'c'], 'Count': [3, 4]}))
book_list.append(pd.DataFrame({'Word': ['d', 'e', 'f'], 'Count': [5, 6, 7]}))
book_list.append(pd.DataFrame({'Word': ['c', 'e'], 'Count': [8, 9]}))
Code:
result = None
for idx_book, book in enumerate(book_list):
    if result is None:
        result = book
    else:
        result = result.merge(book, how="outer", on=["Word"], suffixes=(idx_book - 1, idx_book))
Result:
  Word  Count0  Count1  Count2  Count3
0    a     1.0     NaN     NaN     NaN
1    b     2.0     3.0     NaN     NaN
2    c     NaN     4.0     NaN     8.0
3    d     NaN     NaN     5.0     NaN
4    e     NaN     NaN     6.0     9.0
5    f     NaN     NaN     7.0     NaN
Ended up solving it by using this lambda function:
matrix = reduce(lambda left,right: pd.merge(left,right,on=['Word'],how='outer'), wordData).fillna(0)
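For completeness, a self-contained sketch of that reduce-based merge; note that reduce lives in functools in Python 3, and the toy frames below merely stand in for wordData:

from functools import reduce
import pandas as pd

# Toy stand-ins for the per-book frames collected in wordData
wordData = [
    pd.DataFrame({'Word': ['Thou', 'O'], 'Book1': [100, 20]}),
    pd.DataFrame({'Word': ['Thou', 'Hither'], 'Book2': [88, 8]}),
]

# Outer-merge every frame on 'Word', then replace missing counts with 0
matrix = reduce(lambda left, right: pd.merge(left, right, on=['Word'], how='outer'),
                wordData).fillna(0)
print(matrix)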

Averaging out every four elements in a CSV file

I have a CSV file with, say $n=100$ elements. So the file looks like an $n$-dimensional vector. The question is: how can I average every 4 elements and save the results in a new csv file?
For example I generate a list of random numbers:
import random
import pandas as pd

my_random_list = []
for i in range(0, 9):
    n = random.randint(1, 100)
    my_random_list.append(n)
df = pd.DataFrame(my_random_list)
df.to_csv('my_csv.csv', index=False, header=None)
This is similar to my code. Now, I want to create a new csv (because I already have the data in csv form) where I average and save the first 4 elements, then the next 4, and so on. So I will end up with a csv file with only 25 elements.
Use DataFrame.groupby with integer division of the index to form groups of 4 values, and aggregate with mean:
import numpy as np
import pandas as pd

np.random.seed(2021)
df = pd.DataFrame({'a': np.random.randint(1, 10, size=10)})
print(df)
a
0 5
1 6
2 1
3 7
4 6
5 9
6 7
7 7
8 7
9 7
df1 = df.groupby(df.index // 4).mean()
print (df1)
a
0 4.75
1 7.25
2 7.00
Detail:
print (df.index // 4)
Int64Index([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype='int64')
All together:
df = pd.read_csv(file, header=None)
df1 = df.groupby(df.index // 4).mean()
df1.to_csv('my_csv.csv', index=False, header=None)
import pandas as pd
import random
import csv

# FIRST PART -- GENERATES THE ORIGINAL CSV FILE
my_random_list = []
for i in range(0, 100):
    n = random.randint(1, 100)
    my_random_list.append(n)
df = pd.DataFrame(my_random_list)
df.to_csv('my_csv.csv', index=False, header=None)

# SECOND PART -- POPULATES A LIST WITH THE CONTENTS OF THE
# ORIGINAL CSV FILE
file_CSV = open('my_csv.csv')
data_CSV = csv.reader(file_CSV)
list_CSV = list(data_CSV)

# THIRD PART -- GENERATES A NEW LIST CONTAINING
# THE AVERAGE OF EVERY FOURTH ELEMENT
# AND ITS THREE PREDECESSORS
new_list = []
for i in range(0, len(list_CSV)):
    if i % 4 == 0:
        s = int(list_CSV[i + 0][0])
        s = s + int(list_CSV[i + 1][0])
        s = s + int(list_CSV[i + 2][0])
        s = s + int(list_CSV[i + 3][0])
        s = s / 4
        new_list.append(s)

# FOURTH PART -- GENERATES A NEW CSV
df = pd.DataFrame(new_list)
df.to_csv('new_csv.csv', index=False, header=None)

Parsing array from txt file to Pandas dataframe in Python

Hi, I have the following data in my .txt file:
n|vechicle.car.characteristics[0].speed|180
n|vechicle.car.characteristics[0].weight|3
c|vechicle.car.characteristics[0].color|black
c|vechicle.car.characteristics[0].fuel|95
n|vechicle.car.characteristics[1].speed|160
n|vechicle.car.characteristics[1].weight|4
c|vechicle.car.characteristics[1].color|green
c|vechicle.car.characteristics[1].fuel|92
n|vechicle.car.characteristics[2].speed|200
n|vechicle.car.characteristics[2].weight|5
c|vechicle.car.characteristics[2].color|white
c|vechicle.car.characteristics[2].fuel|95
And I'd like to parse it into such dataFrame:
speed weight color fuel
0 180 3 black 95
1 160 4 green 92
2 200 5 white 95
That's how I solved it:
import re
import pandas as pd

df_output_list = {}
df_output_dict = []
match_counter = 1
with open('sample_car.txt', encoding='utf-8') as file:
    line = file.readline()
    while line:
        result = re.split(r'\|', line.rstrip())
        result2 = re.findall(r'.(?<=\[)(\d+)(?=\])', result[1])
        regex = re.compile('vechicle.car.characteristics.')
        match = re.search(regex, result[1])
        if match:
            if match_counter == 1:
                ArrInd = 0
                match_counter += 1
            #print(df_output_list)
            if ArrInd == int(result2[0]):
                df_output_list[result[1].split('.')[3]] = result[2]
                ArrInd = int(result2[0])
            else:
                df_output_dict.append(df_output_list)
                df_output_list = {}
                df_output_list[result[1].split('.')[3]] = result[2]
                ArrInd = int(result2[0])
        line = file.readline()
df_output_dict.append(df_output_list)
#print(df_output_dict)
df_output = pd.DataFrame(df_output_dict)
print(df_output)
I find this too complicated. Is it possible to simplify it?
Column names should be parsed automatically.
Read it as a csv file with sep='|', take the last column, which contains the values, and then reshape it into the appropriate shape.
>>> columns=['speed','weight','color','fuel']
>>> s = pd.read_csv('filename.txt', sep='|', header=None).iloc[:,-1]
>>> df = pd.DataFrame(s.to_numpy().reshape(-1,4), columns=columns)
>>> df
speed weight color fuel
0 180 3 black 95
1 160 4 green 92
2 200 5 white 95
If you have a fixed row format like n|vechicle.car.characteristics[0].speed|180, then we can do this:
>>> df = pd.read_csv('d.csv', sep='|', header=None)
>>> columns = df.iloc[:,1].str.split('.').str[-1].unique()
>>> df_out = pd.DataFrame(df.iloc[:,-1].to_numpy().reshape(-1,len(columns)), columns=columns)
>>> df_out
speed weight color fuel
0 180 3 black 95
1 160 4 green 92
2 200 5 white 95

Python: Pivot Table/group by specific conditions

I'm trying to change the structure of my data from a text file (.txt), which looks like this:
:1:A
:2:B
:3:C
:1:D
:2:E
:3:F
:4:G
:1:H
:3:I
:4:J
And I would like to transform it into this format (like a pivot table in Excel, where the column name is the character between the ":" delimiters and each group always starts with :1:):
Group  :1:  :2:  :3:  :4:
    1    A    B    C
    2    D    E    F    G
    3    H         I    J
Does anyone have any idea? Thanks in advance.
First create a DataFrame with read_csv and header=None, because there is no header in the file:
import pandas as pd
from io import StringIO

temp = u""":1:A
:2:B
:3:C
:1:D
:2:E
:3:F
:4:G
:1:H
:3:I
:4:J"""
#after testing, replace 'StringIO(temp)' with 'filename.csv'
df = pd.read_csv(StringIO(temp), header=None)
print(df)
0
0 :1:A
1 :2:B
2 :3:C
3 :1:D
4 :2:E
5 :3:F
6 :4:G
7 :1:H
8 :3:I
9 :4:J
Extract the original column with DataFrame.pop, strip the : characters with Series.str.strip, and split the values into 2 new columns with Series.str.split. Then create groups by comparing against the string '1' with Series.eq (==) and taking Series.cumsum, create a MultiIndex with DataFrame.set_index, and finally reshape with Series.unstack:
df[['a','b']] = df.pop(0).str.strip(':').str.split(':', expand=True)
df1 = df.set_index([df['a'].eq('1').cumsum(), 'a'])['b'].unstack(fill_value='')
print (df1)
a  1  2  3  4
a
1  A  B  C
2  D  E  F  G
3  H     I  J
Use:
# Reading text file (assuming stored in CSV format, you can also use pd.read_fwf)
df = pd.read_csv('SO.csv', header=None)
# Splitting data into two columns
ndf = df.iloc[:, 0].str.split(':', expand=True).iloc[:, 1:]
# Grouping and creating a dataframe. Later dropping NaNs
res = ndf.groupby(1)[2].apply(pd.DataFrame).apply(lambda x: pd.Series(x.dropna().values))
# Post processing (optional)
res.columns = [':' + ndf[1].unique()[i] + ':' for i in range(ndf[1].nunique())]
res.index.name = 'Group'
res.index = range(1, res.shape[0] + 1)
res
Group  :1:  :2:  :3:  :4:
    1    A    B    C
    2    D    E    F    G
    3    H         I    J
Another way to do this:
import pandas as pd

#read the file
with open("t.txt") as f:
    content = f.readlines()

#Create a dictionary and read each line from the file, keeping the column names (e.g. :1:) as keys and the row values (e.g. A) as values in the dictionary.
my_dict = {}
for v in content:
    key = v.rstrip()[0:3]    # take the value ':1:'
    value = v.rstrip()[3]    # take the value 'A'
    my_dict.setdefault(key, []).append(value)

#convert the dictionary to a dataframe and transpose it
df = pd.DataFrame.from_dict(my_dict, orient='index').transpose()
df
The output will look like this:
  :1:   :2: :3:   :4:
0   A     B   C     G
1   D     E   F     J
2   H  None   I  None

Checking Padded data in Pandas Dataframe on specific columns

I have a DataFrame that looks like this:
import numpy as np
import pandas as pd

raw_data = {'Series_Date': ['2017-03-10', '2017-03-13', '2017-03-14', '2017-03-15'],
            'SP': [35.6, 56.7, 41, 41],
            '1M': [-7.8, 56, 56, -3.4],
            '3M': [24, -31, 53, 5]}
df = pd.DataFrame(raw_data, columns=['Series_Date', 'SP', '1M', '3M'])
print(df)
I would like to run a test only on certain columns in this DataFrame, whose names are all in this list:
check = {'1M','SP'}
print(check)
For these columns, I would like to know when the value in either of them is the same as the value on the previous day. So the output dataframe should return the Series_Date and a Comment, such as (for this example):
output_data = {'Series_Date': ['2017-03-14', '2017-03-15'],
               'Comment': ["Value for 1M data is same as previous day",
                           "Value for SP data is same as previous day"]}
output_data_df = pd.DataFrame(output_data, columns=['Series_Date', 'Comment'])
print(output_data_df)
Could you please provide some assistance on how to deal with this?
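One minimal way to express the core of that check is to compare each column of interest with its shifted self and keep the rows where any of them match; this is only a sketch and assumes every row is treated as the previous row's "next day" for the comparison:

import pandas as pd

raw_data = {'Series_Date': ['2017-03-10', '2017-03-13', '2017-03-14', '2017-03-15'],
            'SP': [35.6, 56.7, 41, 41],
            '1M': [-7.8, 56, 56, -3.4],
            '3M': [24, -31, 53, 5]}
df = pd.DataFrame(raw_data, columns=['Series_Date', 'SP', '1M', '3M'])

check = ['1M', 'SP']
# A value is "padded" when it equals the value in the previous row of the same column
same_as_prev = df[check].eq(df[check].shift())
print(df.loc[same_as_prev.any(axis=1), 'Series_Date'])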
The following does more or less what you want.
Columns <item>_ok are added to the original dataframe, specifying whether the value is the same as the previous day or not:
from datetime import timedelta

df['Date_diff'] = pd.to_datetime(df['Series_Date']).diff()
for item in check:
    df[item + '_ok'] = (df[item].diff() == 0) & (df['Date_diff'] == timedelta(1))
df_output = df.loc[(df[[item + '_ok' for item in check]]).any(axis=1)]
I'm not sure it is the cleanest way to do it; however, it works:
check = {'1M', 'SP'}
prev_dict = {c: None for c in check}

def check_prev_value(row):
    global prev_dict
    msg = ""
    # MAYBE add clause to check if both are equal
    for column in check:
        if row[column] == prev_dict[column]:
            msg = 'Value for %s data is same as previous day' % column
        prev_dict[column] = row[column]
    return msg

df['comment'] = df.apply(check_prev_value, axis=1)
output_data_df = df[df['comment'] != ""]
output_data_df = output_data_df[["Series_Date", "comment"]].reset_index(drop=True)
For your input:
  Series_Date    SP    1M  3M
0  2017-03-10  35.6  -7.8  24
1  2017-03-13  56.7  56.0 -31
2  2017-03-14  41.0  56.0  53
3  2017-03-15  41.0  -3.4   5
The output is:
  Series_Date                                    comment
0  2017-03-14  Value for 1M data is same as previous day
1  2017-03-15  Value for SP data is same as previous day
Reference: this answer
cols = ['1M', 'SP']
for col in cols:
    df[col + '_dup'] = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount()
Output column will have an integer greater than zero when a duplicate is found.
df:
  Series_Date    SP    1M  3M  1M_dup  SP_dup
0  2017-03-10  35.6  -7.8  24       0       0
1  2017-03-13  56.7  56.0 -31       0       0
2  2017-03-14  41.0  56.0  53       1       0
3  2017-03-15  41.0  -3.4   5       0       1
Slice to find dups:
col = 'SP'
dup_df = df[df[col + '_dup'] > 0][['Series_Date', col + '_dup']]
dup_df:
Series_Date SP_dup
3 2017-03-15 1
Here is a function version of the above (with the added feature of handling multiple columns):
import pandas as pd
import numpy as np

def find_repeats(df, col_list, date_col='Series_Date'):
    dummy_df = df[[date_col, *col_list]].copy()
    dates = dummy_df[date_col]
    date_series = []
    code_series = []
    if len(col_list) > 1:
        for col in col_list:
            these_repeats = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount().values
            repeat_idx = list(np.where(these_repeats > 0)[0])
            date_arr = dates.iloc[repeat_idx]
            code_arr = [col] * len(date_arr)
            date_series.extend(list(date_arr))
            code_series.extend(code_arr)
        return pd.DataFrame({date_col: date_series,
                             'col_dup': code_series}).sort_values(date_col).reset_index(drop=True)
    else:
        col = col_list[0]
        dummy_df[col + '_dup'] = df[col].groupby((df[col] != df[col].shift()).cumsum()).cumcount()
        return dummy_df[dummy_df[col + '_dup'] > 0].reset_index(drop=True)
find_repeats(df, ['1M'])
  Series_Date    1M  1M_dup
0  2017-03-14  56.0       1
find_repeats(df, ['1M', 'SP'])
  Series_Date col_dup
0  2017-03-14      1M
1  2017-03-15      SP
And here is another way using pandas diff:
def find_repeats(df, col_list, date_col='Series_Date'):
    code_list = []
    dates = list()
    for col in col_list:
        these_dates = df[date_col].iloc[np.where(df[col].diff().values == 0)[0]].values
        code_arr = [col] * len(these_dates)
        dates.extend(list(these_dates))
        code_list.extend(code_arr)
    return pd.DataFrame({date_col: dates,
                         'val_repeat': code_list}).sort_values(date_col).reset_index(drop=True)
