How to split text of a dataframe column into multiple columns? - python

I'm trying to retrieve a string from an Excel sheet, split it into words, and then print it or write it back into a new string. However, when I read the data with pandas and try to split it, an error occurs saying that the DataFrame doesn't support the split function.
the excel sheet has this line in it:
I expect an output like this:
import numpy
import pandas as pd
# Load the sheet; the traceback quoted below shows the result is a DataFrame, not a string.
df = pd.read_excel('eng.xlsx')
txt = df
# txt is still the whole DataFrame here, so this call raises the AttributeError quoted below.
x = txt.split()
print(x)
AttributeError: 'DataFrame' object has no attribute 'split'

That's because you are applying the split() function to a DataFrame, and that's not possible: split() is a string method, so it has to be applied to the string values inside a column.
import pandas as pd
import numpy as np
def append_nan(x, max_len):
    """Return the items of ``x`` padded with NaN up to ``max_len`` entries.

    Args:
        x: Sequence of values (e.g. the words of one sentence).
        max_len: Target length of the returned list.

    Returns:
        A new list holding the items of ``x`` followed by as many ``np.nan``
        values as needed to reach ``max_len``. When ``x`` already has
        ``max_len`` items or more, its items are returned unchanged.
    """
    # Build a new list instead of using ``+=`` so the caller's list is never
    # modified in place; max(..., 0) keeps over-long inputs untouched.
    missing = max(max_len - len(x), 0)
    return list(x) + [np.nan] * missing
# I define here a dataframe for the example
#df = pd.DataFrame(['This is my first sentence', 'This is a second sentence with more words'])
# read_excel has no `index` keyword; the parameter is `index_col`.
# header=None means the first row of the sheet is read as data, not as column names.
df = pd.read_excel('your_file.xlsx', index_col=None, header=None)
col_names = df.columns.values.tolist()
df_output = df.copy()
# Split your strings (the first column holds the sentences)
df_output[col_names[0]] = df[col_names[0]].apply(lambda x: x.split(' '))
# Get the maximum length of all your sentences
max_len = max(map(len, df_output[col_names[0]]))
# Append NaN values so that every row has the same number of entries
df_output[col_names[0]] = df_output[col_names[0]].apply(lambda x: append_nan(x, max_len))
# Create the column names (word_0, word_1, ...) and build your dataframe
column_names = ["word_"+str(d) for d in range(max_len)]
df_output = pd.DataFrame(list(df_output[col_names[0]]), columns=column_names)
# Then you can save it
df_output.to_excel('output.xlsx')

Related

How to put JSON chart data into pandas dataframe?

I have downloaded the har file of an interactive chart and have the datapoints in the following format:
'{"x":"2022-03-28T00:00:00Z"', '"value":0.2615}',
'{"x":"2022-03-29T00:00:00Z"', '"value":0.2573}',
'{"x":"2022-03-30T00:00:00Z"', '"value":0.272}', ...
What would be the easiest way to convert this into a pandas dataframe?
Both the date and the value should be columns of the dataframe.
The first problem is that every element is inside ' ', so it is treated as two items/columns, but it should be treated as a single item/dictionary. You may need to replace ', ' with , to get a normal JSON string, which you can convert to a Python dictionary using the json module.
# Read the whole file; the context manager closes the handle afterwards.
with open(filename) as f:
    text = f.read()
# Join the two quoted halves of every record into one quoted JSON string.
text = text.replace("', '", ",")
and later you can use io.StringIO() to load it from text.
It needs quotechar="'" to read it correctly
df = pd.read_csv(io.StringIO(text), names=['data', 'other'], quotechar="'")
next you can convert every JSON string to python dictionary
df['data'] = df['data'].apply(json.loads)
and next convert dictionary to pd.Series which you can split to columns
df[['x','value']] = df['data'].apply(pd.Series)
Finally you may remove columns data, other
del df['data']
del df['other']
Full working example
import io
import json

import pandas as pd

# Sample of the raw data: each record is split into two single-quoted halves.
text = """'{"x":"2022-03-28T00:00:00Z"', '"value":0.2615}',
'{"x":"2022-03-29T00:00:00Z"', '"value":0.2573}',
'{"x":"2022-03-30T00:00:00Z"', '"value":0.272}',"""

#text = open(filename).read()

# Glue the two halves of every record back together so that each line holds
# one single-quoted JSON object.
text = ",".join(text.split("', '"))
#print(text)

# Parse the text as CSV; the single quote is the quote character, and the
# trailing comma on every line ends up in the throw-away 'other' column.
buffer = io.StringIO(text)
df = pd.read_csv(buffer, names=['data', 'other'], quotechar="'")

# Decode every JSON string into a dictionary, then expand the dictionaries
# into the 'x' and 'value' columns.
decoded = df['data'].apply(json.loads)
df[['x', 'value']] = decoded.apply(pd.Series)

# Keep only the expanded columns.
df = df.drop(columns=['data', 'other'])

print(df)
Result:
x value
0 2022-03-28T00:00:00Z 0.2615
1 2022-03-29T00:00:00Z 0.2573
2 2022-03-30T00:00:00Z 0.2720
You can also write some part in one line
df[['x','value']] = df['data'].apply(lambda item: pd.Series(json.loads(item)))
or split it separately (using .str[index] on dictionary)
df['data'] = df['data'].apply(json.loads)
df['x'] = df['data'].str['x']
df['value'] = df['data'].str['value']
BTW:
you may also need to convert x from string to datetime
df['x'] = pd.to_datetime(df['x'])

How to insert list of new column names to a dataframe?

I have created a data frame from an existing Excel file. I have extracted the columns of the dataframe and applied strip() and lower() operations to them. I am having an issue inserting the new list of column names into the dataframe.
Here is the code:
import pandas as pd
import itertools
def fetch_columns(req_columns, input_columns):
    """Find which required columns are present and how to rename them.

    Args:
        req_columns: Mapping of canonical column name -> list of accepted
            aliases, e.g. ``{'emp_id': ['uid', 'empid']}``.
        input_columns: Column names actually available in the input.

    Returns:
        A dict with two entries, in this order:
        ``'actual_columns'`` - the aliases found in ``input_columns``, in
        the order they are listed in ``req_columns``;
        ``'rename_columns'`` - a mapping from each found alias to the
        canonical name it belongs to.
    """
    # Pair every matching alias with its own key. Zipping the found aliases
    # against the keys by position would mislabel columns whenever a key has
    # no match or more than one match.
    matches = [
        (alias, key)
        for key, aliases in req_columns.items()
        for alias in aliases
        if alias in input_columns
    ]
    col_names = [alias for alias, _ in matches]
    rename_col = dict(matches)
    return {'actual_columns': col_names, 'rename_columns': rename_col}
def read_file(file_name):
# Read the Excel file, keep the required columns and rename them via fetch_columns().
df = pd.read_excel(file_name)
print(df)
# Stripped, lower-cased copies of the header names; df.columns itself is not changed here.
input_columns = list(df.columns.str.strip().str.lower())
print(input_columns)
# NOTE(review): df still carries its original headers (e.g. 'UID', 'NAME' in the sample
# sheet), so selecting by the lower-cased names appears to be what raises the reported
# KeyError "['uid', 'name'] not in index".
df = df[input_columns]
print(df)
# Canonical name -> accepted aliases for each required column.
req_columns = {'emp_id': ['uid', 'empid'], 'emp_name': ['name', 'empname']}
result = fetch_columns(req_columns, input_columns)
# columns[0] is the 'actual_columns' list and columns[1] the 'rename_columns' dict
# (relies on the order in which fetch_columns builds its result).
columns = list(result.values())
df = df[columns[0]]
print(df)
df.rename(columns=columns[1], inplace=True)
return df
df = read_file('col_read.xlsx')
print(df.head())
data of 'col_read.xlsx':
UID NAME empage age
1 skd 23 23
I have extracted UID, NAME, empage, age from the Excel file, created a pandas dataframe, and applied the strip() and lower() methods to its column names. I want to replace the old dataframe column names with the new list of lower-cased, whitespace-stripped names, and that is where I get the error.
KeyError: "['uid', 'name'] not in index"
Any suggestion would be appreciated

Python Pandas .iloc Download columns by number

I want to download column numbers, eg 1,3,2. In the param.txt file I have only such an entry
import pandas as pd
import numpy as np
df = pd.read_csv('sample1.csv')
with open('param.txt') as f:
# s holds the file's text as one string (e.g. '1,3,2'), not a list of integers.
s = f.read()
# Wrapping that string in a list gives ['1,3,2'], which .iloc rejects with the IndexError quoted below.
b = df.iloc[:, [s]]
print(b.to_string(index=False))
When I start a script
raise IndexError(f".iloc requires numeric indexers, got {arr}")
IndexError: .iloc requires numeric indexers, got ['1,3,2']
How to simply change from such a form to numeric
Thank you for every help
This should work assuming f.read() returns "1,2,3"
import pandas as pd
import numpy as np
df = pd.read_csv('sample1.csv')
with open('param.txt') as f:
    s = f.read()  # Assuming this is a string such as "1,2,3"
# Split on commas and convert each piece to int -> [1, 2, 3].
# Blank pieces (e.g. from a trailing comma or an empty file) are skipped so
# int() never sees an empty string; int() itself ignores surrounding
# whitespace such as the trailing newline.
s = [int(x) for x in s.split(",") if x.strip()]
# No need for brackets since s is already a list.
# Note that .iloc positions are zero-based.
b = df.iloc[:, s]
print(b.to_string(index=False))

loop through words in a csv file and replace in python

I have a csv file with three columns, namely (cid,ccontent,value) . And I want to loop through each word in ccontent column and translate the words individually.
I found this code for translating a row but I want to translate each word not the row.
How to write a function in Python that translates each row of a csv to another language?
from googletrans import Translator
import pandas as pd
headers = ['A','B','A_translation', 'B_translation']
data = pd.read_csv('./data.csv')
translator = Translator()
# Init empty dataframe with much rows as `data`
df = pd.DataFrame(index=range(0,len(data)), columns=headers)
def translate_row(row):
    """Translate elements A and B within `row`.

    Args:
        row: Indexable record whose first two items are the texts to
            translate.

    Returns:
        A pandas Series labelled with the module-level ``headers``: the two
        original texts followed by their two translations.
    """
    # Both calls use the module-level `translator` with dest='Fr'.
    a = translator.translate(row[0], dest='Fr')
    b = translator.translate(row[1], dest='Fr')
    return pd.Series([a.origin, b.origin, a.text, b.text], headers)
for i, row in enumerate(data.values):
# Fill empty dataframe with given serie.
df.loc[i] = translate_row(row)
print(df)
Thank you
You can try something along these lines, using list comprehensions:
def translate_row(row):
    """Translate the first two fields of `row` word by word.

    Args:
        row: Indexable record whose first two items hold the texts to
            translate.

    Returns:
        A tuple ``(row0bywords, row1bywords)``; each element is the list of
        results returned by ``translator.translate`` for the individual
        words of the corresponding field.
    """
    # Split on whitespace so the loop runs over words; iterating the string
    # directly would translate one character at a time.
    row0bywords = [translator.translate(eachword, dest='Fr') for eachword in str(row[0]).split()]
    row1bywords = [translator.translate(eachword, dest='Fr') for eachword in str(row[1]).split()]
    return row0bywords, row1bywords

Pandas apply, how to combine the results returned

In python pandas apply, the applied function takes each row of the Dataframe and will return another Dataframe, how can I get the combination of (append) these Dataframes returned through applying? For example:
# this is an example
import pandas as pd
import numpy as np
def newdata(X, data2):
    """Subtract one sampled row of ``data2`` from the row ``X``.

    Args:
        X: One row (a Series with a ``'no'`` entry), as passed by
            ``DataFrame.apply(..., axis=1)``.
        data2: DataFrame with a ``'no'`` column to sample from.

    Returns:
        The result of ``X - sampled_row``, where ``sampled_row`` is a
        one-row selection of ``data2`` whose ``'no'`` differs from
        ``X['no']``.
    """
    # Only rows with a different 'no' are candidates; random_state=100 makes
    # the draw repeatable.
    return X - data2[data2['no'] != X['no']].sample(1, random_state=100)
col = ['no','a','b']
data1 = pd.DataFrame(np.column_stack((range(5),np.random.rand(5,2))),columns=col)
data2 = pd.DataFrame(np.column_stack((range(3),np.random.rand(3,2))),columns=col)
Newdata = data1.apply(newdata, args=(data2,), axis=1)

Categories