from docx.api import Document
import pandas as pd
# Read every table of the Word document and export the cell text to Excel.
document = Document("D:/tmp/test.docx")
tables = document.tables

# Collect one list of cell texts per table row. Building the frame once at the
# end replaces DataFrame.append, which was removed in pandas 2.0 and copied the
# whole frame on every call.
rows = []
for table in tables:
    for row in table.rows:
        rows.append([cell.text for cell in row.cells])

# Passing the column names here also keeps an empty document from failing.
df = pd.DataFrame(rows, columns=["Column1", "Column2"])

# The index and header are still written; see the answer below to drop them.
df.to_excel("D:/tmp/test.xlsx")
print(df)
Output
`>>>
Column1 Column2
0 Hello TEST
1 Est Ting
2 Gg ff
How can I remove the index column (0, 1, 2) and the header row, and how can I add some images with this code?
You can drop the index and the header when exporting to Excel by simply passing the following arguments:
# header=False / index=False suppress the column-name row and the index column.
df.to_excel("test.xlsx", header=False, index=False)
It can be done like this.
import pandas as pd

dataset = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})

sheet_name = 'Images'  # Sheet name in which the image will be generated
cell = 'B2'  # Position of the image in w.r.t cell value

# The context manager saves and closes the workbook exactly once on exit.
# The original closed it twice (workbook.close() then writer.save()), and
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
    # Data goes to its own sheet, without index or header.
    dataset.to_excel(writer, sheet_name='Data', index=False, header=False)

    # The image goes to a second sheet created directly on the xlsxwriter workbook.
    workbook = writer.book
    worksheet = workbook.add_worksheet(sheet_name)
    worksheet.insert_image(cell, 'Tmp.jpg')  # Add image
Related
I have data in Excel where the last column contains text with line breaks (typed with Enter). Here is an example of my data:
If I convert using python to csv, my data looks like this:
I need the TEXT column to look like this:
This is my script:
import pandas as pd
import os
import numpy as np
# Export every sheet of every .xlsx file in WD to its own pipe-delimited CSV.
WD = r'XXX'
os.chdir(WD)

for file in os.listdir(WD):
    if not file.endswith('.xlsx'):
        continue
    FILE = file
    # Open the workbook once and reuse it for every sheet.
    with pd.ExcelFile(FILE) as workbook:
        sheet_names = workbook.sheet_names
        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'.format(sn, FILE.replace('.xlsx', '.csv'))
            # Read the sheet being exported; without sheet_name every output
            # file contained the first sheet only.
            df = pd.read_excel(workbook, sheet_name=sn)
            print(FILE, sn)
            # Blank out boolean cells and leave every other value untouched.
            for col in df.columns.to_list():
                df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
            # Keep exactly these columns, in this order.
            cn = ['IN', 'NAME', 'TEXT']
            df = df.reindex(columns=cn)
            df.to_csv(OUTPUT_FILE, sep='|', encoding='utf-8-sig', index=False)
Do you have any idea?
I hope this solves your problem. Run `pip install xlsxwriter` before executing it.
Excel to csv:
import pandas as pd
df = pd.read_excel('./keep_enter.xlsx')
def replace_custom_func(x):
    """Turn multi-line text into an Excel formula that rebuilds its line breaks.

    Each line of ``x`` becomes a quoted string literal and the literals are
    joined with ``&CHAR(10)&``, e.g. ``"a\\nb"`` -> ``="a"&CHAR(10)&"b"``.

    Args:
        x: Cell value. Only non-empty strings are converted.

    Returns:
        The formula string, or ``x`` unchanged when it is empty or not a
        string (for example a NaN coming from an empty Excel cell).
    """
    # Empty cells arrive as NaN (a float); len() on them used to raise.
    if not isinstance(x, str) or not x:
        return x
    # A double quote inside an Excel string literal must be doubled.
    quoted = ['"' + line.replace('"', '""') + '"' for line in x.split('\n')]
    return "=" + "&CHAR(10)&".join(quoted)
# Rewrite every 'Text' value as a CHAR(10) formula, then export pipe-delimited.
text_as_formula = df['Text'].apply(replace_custom_func)
df['Text'] = text_as_formula
df.to_csv('keep_enter1.csv', sep='|', index=False)
CSV to Excel:
df = pd.read_csv('./keep_enter1.csv', sep='|')

# The context manager writes and closes the Excel file on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('new_excel_replace12345.xlsx', engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Wrap text in columns C:D so CHAR(10) line breaks are displayed.
    # Named wrap_format so the builtin format() is not shadowed.
    wrap_format = workbook.add_format({'text_wrap': True})
    worksheet.set_column('C:D', None, wrap_format)

    # NOTE(review): only the first data row (row 1, column 2) is rewritten as
    # a formula here - confirm whether every row should be.
    worksheet.write_formula(1, 2, df['Text'][0])
Output:
I am working with Excel and I have to export some columns to another one but this second one is a template having some colors, the logo of a company and stuff.
Is there any way to preserve the look and functionality that template.xlsx has?
My code:
import pandas as pd
# Source workbook and the names of its worksheets.
spreadsheet_file = pd.ExcelFile('example.xlsx')
worksheets = spreadsheet_file.sheet_names

# Source column name -> header used in the exported sheet.
cat_dic = {"Part Number":"CÓDIGO", "QTY":"QT", "Description":"DESCRIÇÃO", "Material":"MATERIAL", "Company":"MARCA","Category":"OPERAÇÃO"}

# One prepared frame per worksheet. The original overwrote its result on every
# iteration, so only the last worksheet reached the output file.
frames = []
for sheet_name in worksheets:
    df = pd.read_excel(spreadsheet_file, sheet_name)
    # Getting only the columns asked: "Part Number","QTY","Description","Material","Company","Category"
    df = df[["Part Number","QTY","Description","Material","Company","Category"]]
    # Organizing info:
    # 1st by Category
    # 2nd by Description
    df = df.sort_values(['Category', 'Description'], ascending=[False, False])
    # Change the column names directly instead of round-tripping through a dict.
    frames.append(df.rename(columns=cat_dic))

# Exporting Data
df2 = pd.concat(frames, ignore_index=True)
df2.to_excel('template2.xlsx', sheet_name='Projeto', index=False)
Example:
Template:
My output:
Thanks in advance for any help.
You will need to use openpyxl if you want to update only the text and keep the format, colors, etc. of the template as they are. Updated code is below. Note that
I have not used your df2 code, because the template file already has the new headers; the code only writes the data from each worksheet into the file.
You can read each worksheet using read_excel, but writing will need to be using the openpyxl.load_workbook and finally saving the file once all worksheets are read
Open the template file shown in pic above using load_workbook before the FOR loop and save to a new file template2 after the FOR loop is complete
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

spreadsheet_file = pd.ExcelFile('example.xlsx')
worksheets = spreadsheet_file.sheet_names
#cat_dic = {"Part Number":"CÓDIGO", "QTY":"QT", "Description":"DESCRIÇÃO", "Material":"MATERIAL", "Company":"MARCA","Category":"OPERAÇÃO"}
#d = {}

# Load the formatted template once; only cell values are written below.
wb = openpyxl.load_workbook('Template.xlsx')  # Your Template file
ws = wb['Sheet1']
rownumber = 2  # Skip 2 rows and start writing from row 3 - first two are headers in template file

for sheet_name in worksheets:
    df = pd.read_excel(spreadsheet_file, sheet_name)
    # Getting only the columns asked: "Part Number","QTY","Description","Material","Company","Category"
    df = df[["Part Number","QTY","Description","Material","Company","Category"]]
    # Organizing info:
    # 1st by Category
    # 2nd by Description
    df = df.sort_values(['Category', 'Description'], ascending=[False, False])

    # Read all rows from df, but don't read index or header
    rows = dataframe_to_rows(df, index=False, header=False)
    for r_idx, row in enumerate(rows, 1):
        for c_idx, value in enumerate(row, 1):
            # Write to cell, but after rownumber + row index
            ws.cell(row=r_idx + rownumber, column=c_idx, value=value)

    # Move the rownumber to end, so next worksheet data comes after this sheet's data
    rownumber += len(df)

# Save under a new name so the original template stays untouched.
wb.save('template2.xlsx')
I have multiple dataframes that look like this, the data is irrelevant.
I want it to look like this, i want to insert a title above the column headers.
I want to combine them into multiple tabs in an excel file.
Is it possible to add another row above the column headers and insert a Title into the first cell before saving the file to excel.
I am currently doing it like this.
# Write one sheet per .xlsx file found in `directory` into a single workbook.
# The `with` block saves and closes the writer on exit, so no explicit
# writer.close() is needed (the original closed it a second time).
with pd.ExcelWriter('merged_file.xlsx', engine='xlsxwriter') as writer:
    for filename in os.listdir(directory):
        if filename.endswith('xlsx'):
            print(filename)
            # The three branches are placeholders from the original post
            # ("some function"); presumably each one builds `df` for that
            # kind of file - fill them in before running.
            if 'brands' in filename:
                ...  # some function
            elif 'share' in filename:
                ...  # somefunction
            else:
                ...  # some function
            # Sheet name is the file name minus its last five characters.
            df.to_excel(writer, sheet_name=f'{filename[:-5]}', index=True, index_label=True)
But the sheet_name is too long, that's why I want to add the title above the column headers.
I tried this code,
# Attempt to put the file name above the headers as a second column level.
# Pair the file name (minus its last five characters) with every existing column name.
columns = df.columns
columns = list(zip([f'{filename[:-5]}'] * len(df.columns), columns))
columns = pd.MultiIndex.from_tuples(columns)
# NOTE(review): constructing a frame from `df` with a different `columns`
# argument appears to select those labels from `df` rather than relabel it;
# none of the new tuples exist in `df`, which would explain the empty
# result described below - confirm against the pandas documentation.
df2 = pd.DataFrame(df,index=df.index,columns=columns)
# Only the first three characters of the file name are used as the sheet name here.
df2.to_excel(writer,sheet_name=f'{filename[0:3]}',index=True,index_label=True)
But it ends up looking like this with all the data gone,
It should look like this
You can write the data starting from the second row first, and then write your text to the first cell:
df = pd.DataFrame({'col': list('abc'), 'col1': list('def')})
print (df)
col col1
0 a d
1 b e
2 c f
text = 'sometitle'

# The context manager saves and closes the file on exit; ExcelWriter.save()
# was removed in pandas 2.0.
with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
    # startrow=1 leaves row 0 free, so the header lands on the second row.
    df.to_excel(writer, sheet_name='Sheet1', startrow=1, index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    # Put the title in the first cell (row 0, column 0).
    worksheet.write(0, 0, text)
Then, to read the file back, you need:
title = pd.read_excel('test.xlsx', nrows=0).columns[0]
print (title)
sometitle
df = pd.read_excel('test.xlsx', skiprows=1)
print (df)
col col1
0 a d
1 b e
2 c f
You can use MultiIndex. There is an example:
import pandas as pd
df = pd.read_excel('data.xls')

# Two-level header: the single label 'Title' on top of every existing column name.
top_level = ['Title']
bottom_level = list(df.columns)
header = pd.MultiIndex.from_product([top_level, bottom_level])

# Rebuild the frame from the raw values so the new header is applied as-is.
pd.DataFrame(df.to_numpy(), index=None, columns=header)
Also, I can share with you my solution with real data in Deepnote (my favorite tool). Feel free to duplicate and play with your own .xls:
https://deepnote.com/publish/3cfd4171-58e8-48fd-af21-930347e8e713
This is my current code:
import pandas as pd
from datetime import date, time, datetime
import numpy as np
####IMPORT FILES####
df = pd.read_csv("C:/Users/data.csv")
df1 = pd.read_excel("C:/Users/user.xlsx")

####FILTERS####
data1 = df1[df1['SAG/non-SAG'] == "SAG"]
data = data1[data1['MEDIA TYPE'].isin(["TV/OTT", "TV", "TV "])]

# Unique ISCI codes in first-seen order; each one becomes a worksheet.
# Duplicates are dropped because a workbook cannot hold two sheets with the
# same name, and missing values are skipped because they cannot be sliced.
c = list(dict.fromkeys(data['ISCI or AD-ID #'].dropna()))
print(c)

####EXPORT####
# The original wrote the same combined frame to every sheet. Here each sheet
# receives only the rows whose 'Isci Code' matches the sheet's own code.
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    for key in c:
        print(key)
        # A code and the same code without its last character (e.g. XXH and
        # XX) are treated as the same ISCI.
        matching_rows = df[df['Isci Code'].isin([key, key[:-1]])]
        matching_rows.to_excel(writer, sheet_name=key)
And this is the output I am getting:
Excel output:
I have to group the highlighted ISCIs together and make sure the only ISCIs in the excel worksheet are the same as the name of the worksheet. For example: sheet XXH should contain row 2 and 10, sheet CCH should contain row 5, sheet AAH should contain row 3 and 6 and so on.
(Basically, code XXH and XX mean the same. We can ignore the last digit 'H' when grouping the ISCIs. )
I am trying to find out how to write to a specific column/row, similar to how
`df.to_excel(startcol=1, startrow=1)` works. I have to do this in an open Excel workbook, and I found the library xlwings. I'm currently using openpyxl; how would I do this in xlwings? I read the documentation on writing to specific cells such as A1, but not on specifying columns/rows by number.
#Write to Excel
# Load the existing workbook and attach it to an openpyxl-backed writer that
# points at the same file.
book = load_workbook('Test.xlsx')
writer = pd.ExcelWriter('Test.xlsx', engine='openpyxl')
writer.book = book
# Map sheet title -> worksheet object for every sheet of the loaded workbook.
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
# addtoexcel(df, row): converts columns "val1"/"val2" to numeric and writes df
# to sheet "test" without index or header, at startrow=row and startcol=line+2,
# where `line` is an integer read from df.loc[0][1].
# NOTE(review): the original indentation was lost, so which statements belong
# inside the for loop / the if (in particular the transpose and the to_excel
# call) cannot be determined from here - confirm before running.
def addtoexcel(df, row):
i = 0
df[["val1", "val2"]] = df[["val1", "val2"]].apply(pd.to_numeric)
line = int(df.loc[i][1])
for i in range(1, line+1):
if line ==i:
df = df.T
df.to_excel(writer, "test", index = False, header = False, startcol = line+2, startrow = row)
How can I print in xlwings by specifying column/row like (1,1)?
You can easily print a pandas dataframe to excel using xlwings. The range object takes a row and a column number as arguments (or just a cell reference as a string). Consider the following code:
import xlwings as xw
import pandas as pd
# Target cell, given as a row number and a column number.
row = 1
column = 2
path = 'your/path/file.xlsx'

# Small demo frame: two columns, three rows.
df = pd.DataFrame({'A': [5, 5, 5], 'B': [6, 6, 6]})

# Open the workbook, pick the sheet and assign the frame to the target range.
wb = xw.Book(path)
sht = wb.sheets["Sheet1"]
target = sht.range(row, column)
target.value = df
You can also add options to include index/header:
sht.range(row, column).options(index=False, header=False).value = df