Naming excel sheet based on data in the sheet using pandas - python

This is my current code:
import pandas as pd
from datetime import date, time, datetime
import numpy as np
####IMPORT FILES####
df=pd.read_csv("C:/Users/data.csv")
df1=pd.read_excel("C:/Users/user.xlsx")
####FILTERS####
data1 = df1[df1['SAG/non-SAG'] == "SAG"]
data = data1[data1['MEDIA TYPE'].isin(["TV/OTT", "TV", "TV "])]
c = np.array(data['ISCI or AD-ID #'])
for i in c:
c = np.append(c, i[0:-1])
print(c)
df_new = pd.DataFrame()
for i in c:
df_new = pd.concat([df_new, df[df['Isci Code'] == i]])
df_new = pd.DataFrame()
for i in c:
df_new = pd.concat([df_new, df[df['Isci Code'] == i]])
writer = pd.ExcelWriter('output.xlsx', engine='xlsxwriter')
for i,key in enumerate(c):
print(key)
df_new.to_excel(writer, sheet_name=key)
writer.save()
And this is the output I am getting:
Excel output:
I have to group the highlighted ISCIs together and make sure the only ISCIs in the excel worksheet are the same as the name of the worksheet. For example: sheet XXH should contain row 2 and 10, sheet CCH should contain row 5, sheet AAH should contain row 3 and 6 and so on.
(Basically, the codes XXH and XX mean the same thing, so the trailing character 'H' can be ignored when grouping the ISCIs.)

Related

Keep enter space in text column when converting to csv

I have data in Excel where the last column contains text with line breaks (entered with the Enter key). Here is an example of my data:
If I convert using python to csv, my data looks like this:
I need the TEXT column to look like this:
This is my script:
import pandas as pd
import os
import numpy as np
WD = r'XXX'
os.chdir(WD)
for file in os.listdir(WD):
if file.endswith('.xlsx'):
FILE = file
sheet_names = pd.ExcelFile(FILE).sheet_names
for sn in sheet_names:
OUTPUT_FILE = '{}_{}'.format(sn,FILE.replace('.xlsx','.csv'))
df = pd.read_excel(FILE,)
print(FILE, sn)
for col in df.columns.to_list():
df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
cn = ['IN', 'NAME', 'TEXT']
df = df.reindex(columns = cn)
df.to_csv(OUTPUT_FILE,sep='|',encoding='utf-8-sig',index=False)
Do you have any idea?
I hope this solution works for you. Run `pip install xlsxwriter` before executing it.
Excel to csv:
import pandas as pd
df = pd.read_excel('./keep_enter.xlsx')
def replace_custom_func(x):
    """Turn multi-line text into an Excel formula that keeps its line breaks.

    Each line of ``x`` becomes a quoted string literal and the literals are
    joined with ``&CHAR(10)&`` (CHAR(10) is the line-feed character), e.g.
    ``"a\\nb"`` becomes ``="a"&CHAR(10)&"b"``.

    Values that are not strings (for example NaN produced by an empty cell)
    and empty strings are returned unchanged.
    """
    if not isinstance(x, str) or not x:
        return x
    # A double quote inside an Excel string literal is written as two quotes;
    # without this, text containing '"' would yield an invalid formula.
    quoted_lines = (
        '"{}"'.format(line.replace('"', '""')) for line in x.split('\n')
    )
    return '=' + '&CHAR(10)&'.join(quoted_lines)
df['Text'] = df['Text'].apply(lambda x: replace_custom_func(x))
df.to_csv('keep_enter1.csv', sep='|', index=False)
CSV to Excel:
df = pd.read_csv('./keep_enter1.csv', sep='|')
writer = pd.ExcelWriter('new_excel_replace12345.xlsx', engine='xlsxwriter')
# # Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1', index=False)
# # Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']
format = workbook.add_format({'text_wrap': True})
worksheet.set_column('C:D', None, format)
worksheet.write_formula(1, 2, df['Text'][0])
# # Close the Pandas Excel writer and output the Excel file.
writer.save()
Output:

How do I export pandas processed data to excel and want to add a row to the top of the data? [duplicate]

I have multiple dataframes that look like this, the data is irrelevant.
I want it to look like this: I want to insert a title above the column headers.
I want to combine them into multiple tabs in an excel file.
Is it possible to add another row above the column headers and insert a title into the first cell before saving the file to Excel?
I am currently doing it like this.
with pd.ExcelWriter('merged_file.xlsx',engine='xlsxwriter') as writer:
for filename in os.listdir(directory):
if filename.endswith('xlsx'):
print(filename)
if 'brands' in filename:
some function
elif 'share' in filename:
somefunction
else:
some function
df.to_excel(writer,sheet_name=f'{filename[:-5]}',index=True,index_label=True)
writer.close()
But the sheet_name is too long, that's why I want to add the title above the column headers.
I tried this code,
columns = df.columns
columns = list(zip([f'{filename[:-5]}'] * len(df.columns), columns))
columns = pd.MultiIndex.from_tuples(columns)
df2 = pd.DataFrame(df,index=df.index,columns=columns)
df2.to_excel(writer,sheet_name=f'{filename[0:3]}',index=True,index_label=True)
But it ends up looking like this with all the data gone,
It should look like this
You can write the data starting from the second row first, and then write your title text into the first cell:
df = pd.DataFrame({'col': list('abc'), 'col1': list('def')})
print (df)
col col1
0 a d
1 b e
2 c f
writer = pd.ExcelWriter('test.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', startrow = 1, index=False)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
text = 'sometitle'
worksheet.write(0, 0, text)
writer.save()
Then, to read the file back, you need:
title = pd.read_excel('test.xlsx', nrows=0).columns[0]
print (title)
sometitle
df = pd.read_excel('test.xlsx', skiprows=1)
print (df)
col col1
0 a d
1 b e
2 c f
You can use MultiIndex. There is an example:
import pandas as pd
df = pd.read_excel('data.xls')
header = pd.MultiIndex.from_product([['Title'],
list(df.columns)])
pd.DataFrame(df.to_numpy(), None , columns = header)
Also, I can share with you my solution with real data in Deepnote (my favorite tool). Feel free to duplicate and play with your own .xls:
https://deepnote.com/publish/3cfd4171-58e8-48fd-af21-930347e8e713

excel to doc in python

from docx.api import Document
import pandas as pd
document = Document("D:/tmp/test.docx")
tables = document.tables
df = pd.DataFrame()
for table in document.tables:
for row in table.rows:
text = [cell.text for cell in row.cells]
df = df.append([text], ignore_index=True)
df.columns = ["Column1", "Column2"]
df.to_excel("D:/tmp/test.xlsx")
print df
Output
`>>>
Column1 Column2
0 Hello TEST
1 Est Ting
2 Gg ff
How can I remove the index column (0, 1, 2) and the header row from the output, and how can I add some images with this code?
You can remove the index and header when exporting to Excel by simply adding the following arguments:
df.to_excel("test.xlsx", header = None, index = False)
It can be done like this.
import pandas as pd

dataset = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
sheet_name = 'Images'  # Sheet name in which the image will be generated
cell = 'B2'  # Position of the image w.r.t. cell value

# The context manager saves and closes the file exactly once on exit.
# Closing the workbook by hand and then saving the writer would close the
# same file twice.
with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
    # Write the data without the index column and without the header row.
    dataset.to_excel(writer, sheet_name='Data', index=False, header=False)
    workbook = writer.book
    worksheet = workbook.add_worksheet(sheet_name)
    worksheet.insert_image(cell, 'Tmp.jpg')  # Add image

Storing data from a for loop in excel python

I want to store all the data produced in the for loop in an Excel file; currently only the last output is stored:
import pandas
import openpyxl
outputFile = 'outputData.xlsx'
workbook = openpyxl.load_workbook(os.getcwd() + '/sourceData.xlsx')
sheet = workbook["Sheet1"]
for i in range(2, sheet.max_row + 1):
<I do some ops to copy the data>
data = pyperclip.paste() #Want this data to be stored in the outPut excel, there is different input for every cell, so there will also be different output
df = pd.DataFrame({'Address':[mapData]})
df2 = pd.DataFrame()
df2 = df2.append(df, ignore_index=True, sort=False)
writer = ExcelWriter(outputFile)
df2.to_excel(writer,'Sheet1',index=False)
writer.save()
Just move the first initialization of df2 out of the loop:
...
# define df2 here, just once
df2 = pd.DataFrame()
for i in range(2, sheet.max_row + 1):
... # your operations
df = pd.DataFrame({'Address':[mapData]})
# append df2 immediately after generation of df
df2 = df2.append(df, ignore_index=True, sort=False)
# save as before
writer = ExcelWriter(outputFile)
df2.to_excel(writer,'Sheet1',index=False)
writer.save()

Compare two excel files and return non common rows

I have two excel files like below-
File1.xlsx
File2.xlsx
I want to compare each row and each column and eliminate common rows from both files (except the column heading). The output should look like below -
File1.xlsx
File2.xlsx
My code-
import pandas
from datetime import datetime
import numpy
df = pandas.read_excel('File1.xlsx')
FORMAT = ['col1','col2','col3']
df_selected = df[FORMAT]
df2 = pandas.read_excel('File2.xlsx')
FORMAT2 = ['col1','col2','col3']
df_selected2 = df2[FORMAT2]
def compare(row1, row2):
    """Compare two rows value by value.

    The first value of ``row1`` is treated as a date: when it is a string it
    is parsed with the ``'%d-%b-%Y'`` format (e.g. ``'01-Jan-2020'``) before
    being compared with the first value of ``row2``. A first value that is
    not a string (for example an already-parsed datetime) is compared as is
    instead of being passed to ``strptime``, which only accepts strings.
    All remaining values are compared directly.

    Returns ``'matched'`` when every compared pair is equal, otherwise
    ``'mismatched'``. Raises ``ValueError`` if the first value is a string
    that does not match the expected date format.
    """
    for position, (t1, t2) in enumerate(zip(row1, row2)):
        # Only the first column holds a date that may still be text.
        if position == 0 and isinstance(t1, str):
            t1 = datetime.strptime(t1, '%d-%b-%Y')
        if t1 != t2:
            return 'mismatched'
    return 'matched'
def iterate(array1, array2):
result = array1
for index, row1 in enumerate(array1):
for row2 in array2:
if(compare(row1,row2)=='matched'):
del result[index]
break
df = pandas.DataFrame(result)
df.columns=['col1','col2','col3']
writer = pandas.ExcelWriter('output.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='welcome', index=False)
writer.save()
iterate(df_selected.values,df_selected2.values)
But I am getting this error output-
ValueError: cannot delete array elements
Please help.
You can try this solution. It should give you the desired result:
import pandas as pd

# Columns that together identify a row when looking for common rows.
key_columns = ['col1', 'col2', 'col3']

# read_excel takes no `sep` or `dayfirst` arguments (those belong to
# read_csv), so only the date-parsing option is passed here.
df1 = pd.read_excel('File1.xlsx', parse_dates=['col1'])
df2 = pd.read_excel('File2.xlsx', parse_dates=['col1'])

# Stack both files; keep=False then removes every row that occurs more than
# once, which leaves only the rows that are not common to both files.
temp_df = pd.concat([df1, df2], ignore_index=True)
temp_df = temp_df.drop_duplicates(subset=key_columns, keep=False)

# The inner merge keeps, per file, only the rows that survived above.
final_df1 = pd.merge(df1, temp_df, how='inner', on=key_columns)
final_df2 = pd.merge(df2, temp_df, how='inner', on=key_columns)
final_df1.to_excel('file_1_modified.xlsx', index=False)
final_df2.to_excel('file_2_modified.xlsx', index=False)
Below is a snapshot of my result. Please note that I read from CSV files, so I used pd.read_csv() in my snapshot, but it will give the same result.

Categories