Python: Writing a List to a Row in Excel - python

I'm trying to read in data from a text file, create one long string of everything contained in that text file, and split it by space. I'm then trying to send that list split by space to printStringArray and have it print to an Excel sheet. I'm having trouble figuring out how to keep track of what row and column the data should be input into.
rowNum = 1
def createStringArray(theFilePath):
theFinalString = ""
with open(theFilePath) as file_in:
for line in file_in:
lineToString = str(line)
theCompleteString = lineToString.split()
printStringArray(theCompleteString)
def printStringArray(theStringArray):
with xlsxwriter.Workbook('testingThis.xlsx') as workbook:
worksheet = workbook.add_worksheet()
colNum = 1
global rowNum
for data in theStringArray:
worksheet.write(rowNum, colNum, data)
colNum = colNum + 1
rowNum = rowNum + 1
EDITED:
def printStringArray(theStringArray):
with xlsxwriter.Workbook('testingThis.xlsx') as workbook:
worksheet = workbook.add_worksheet()
colNum = 0
global rowNum
for data in theStringArray:
worksheet.write(rowNum, colNum, str(data))
print(rowNum)
print(colNum)
print(data)
colNum = colNum + 1
rowNum = rowNum + 1
I added some prints in to see where I was going wrong, but all the numbers are exactly what I want them to be for rowNum, colNum, and data. Right now it is only printing the very last line.
EDIT #2
rowNum = 0
with xlsxwriter.Workbook('testingThis.xlsx') as workbook:
worksheet = workbook.add_worksheet()
def createStringArray(theFilePath):
theFinalString = ""
with open(theFilePath) as file_in:
for line in file_in:
lineToString = str(line)
theCompleteString = lineToString.split()
printStringArray(theCompleteString)
for aString in theCompleteString:
theFinalString = theFinalString + aString + "--"
print(theFinalString)
def printStringArray(theStringArray):
colNum = 0
global rowNum
worksheet.write(15, 15, "Aapple")
for data in theStringArray:
worksheet.write(rowNum, colNum, str(data))
print(rowNum)
print(colNum)
print(data)
colNum = colNum + 1
rowNum = rowNum + 1

Here is my attpemt to read a textfile and write the content to an excel sheet, maybe this can help.
I use the write_row() function to write a list of strings to a row. This way I don't need to take care of the correct column numbers.
import xlsxwriter
# read textfile line by line
with open(path2txt, 'r') as txtfile:
lines_raw = txtfile.readlines()
# remove newline and whitespace on every line
lines = [line.strip() for line in lines_raw]
# write lines to worksheet,
# here we use "write_row()" to write the list of words column by column
with xlsxwriter.Workbook('test.xlsx') as workbook:
sheet = workbook.add_worksheet()
row, col = 0, 0
for line in lines:
parts = line.split()
if len(parts)>0: # only write nonempty lines
sheet.write_row(row, col, parts)
row += 1

Related

How to iterate thorugh multiple excel sheets using openpyxl library in Python?

I am using Openpyxl library to read xlsx file and extract few contents and add more strings to my txt output file.The excel file I am currently using contain sheets with name Summary and Employee. My below code is working fine for my current excel file. Now the issue is I would to use the same code for reading another excel file containing more sheets whose sheetnames I am not sure of. So in my code line ws = wb['Employee']. The sheetname will change all the time. However, One thing I am sure about is I don't want to read any data from sheet1. All the data extraction will occur from sheet2 onwards in all the xlsx files. I am not sure how to proceed from here so any help will be appreciated.
Thanks in advance for your time and efforts!
Code:
from openpyxl import load_workbook
data_file='\\test.xlsx'
# Load the entire workbook.
wb = load_workbook(data_file)
ws = wb['Employee'] #Manually adding sheet name here
mylines={"Column_name":[],"Column_Type":[]} #Getting 2 columns data from row 6
type_strs = {
'String': 'VARCHAR(256)',
'Numeric': 'NUMBER',
'Date': 'NUMBER(4,0)',
'Int': 'NUMBER'
}
for index, value in enumerate(mylines["Column_Type"]):
mylines["Column_Type"][index] = type_strs.get(value, value)
for i in range(6, ws.max_row+1):
name = ws.cell(row=i, column=1).value
name1=ws.cell(row=i, column=2).value
mylines["Column_name"].append(name) #Appending dictionary key "Column_name"
mylines["Column_Type"].append(name1) #Appending dictionay key "Column_type"
for index, value in enumerate(mylines["Column_Type"]):
mylines["Column_Type"][index] = type_strs.get(value, value)
theString = " "
for i in range(len(mylines['Column_name'])):
theString += mylines['Column_name'][i] + " " + mylines['Column_Type'][i]
if i < len(mylines['Column_name'])-1:
theString += ", "
outputFile = open('/output.txt', 'w') # Text file Output
outputFile.write("CREATE TABLE TRANSIENT TABLE STG_EMPLOYEE({});".format(theString) + "\n")
outputFile.close() #Closing file
Updated Code based on SO User comment:
from openpyxl import load_workbook
data_file='\\test.xlsx'
# Load the entire workbook.
wb = load_workbook(data_file)
#ws = wb['Employee'] #Manually adding sheet name here
mylines={"Column_name":[],"Column_Type":[]} #Getting 2 columns data from row 6
type_strs = {
'String': 'VARCHAR(256)',
'Numeric': 'NUMBER',
'Date': 'NUMBER(4,0)',
'Int': 'NUMBER'
}
for index, value in enumerate(mylines["Column_Type"]):
mylines["Column_Type"][index] = type_strs.get(value, value)
skip = True
for ws in wb.worksheets:
if skip == True:
skip = False
else:
for i in range(6, ws.max_row+1):
name = ws.cell(row=i, column=1).value
name1=ws.cell(row=i, column=2).value
mylines["Column_name"].append(name) #Appending dictionary key "Column_name"
mylines["Column_Type"].append(name1) #Appending dictionay key "Column_type"
for index, value in enumerate(mylines["Column_Type"]):
mylines["Column_Type"][index] = type_strs.get(value, value)
theString = " "
for i in range(len(mylines['Column_name'])):
theString += mylines['Column_name'][i] + " " + mylines['Column_Type'][i]
if i < len(mylines['Column_name'])-1:
theString += ", "
outputFile = open('/output.txt', 'w') # Text file Output
outputFile.write("CREATE TABLE TRANSIENT TABLE STG_EMPLOYEE({});".format(theString) + "\n")
outputFile.close() #Closing file
Excel data
<Sheet 1 Name -> Summary Sheet: Empty
<Sheet 2 Name -> Employee Sheet
File Name: Employee
Sheet Name: Employee
File Type: csv
Field Name Type
Name String
Salary Numeric
Date Date
Phone Int
<Sheet 3 Name-> Employee1 Sheet
File Name: Employee
Sheet Name: Employee1
File Type: csv
Field Name Type
Employee Name Date
Employee Salary Int
Employment Date Int
Office Phone Int
To iterate through all worksheets in a workbook and read data in them (except the first worksheet, remove the ws = wb['Employee']
Use a for loop (insert before for i in range(5,... as this
skip = True
for ws in wb.worksheets:
if skip == True:
skip = False
else:
for i in range(6, ws.max_row+1):
name = ws.cell(row=i, column=1).value
....
This will read each sheet and append data to mylines, except the first sheet
Second Update
As you mentioned in below comment, to add a new line with the new SQL query, please make these additional changes
Add another entry to dictionary to indicate new line as below (careful to ensure the lines execute after all lines in a particular sheet are read)
Edit the String formation so that once a NewLine is seen, that string is written to the output file. Do note that the NewFile boolean value will overwrite any file that is there. Multiple lines will be appended post that.
skip = True
for ws in wb.worksheets:
if skip == True:
skip = False
else:
for i in range(6, ws.max_row+1):
name = ws.cell(row=i, column=1).value
print(i, name)
name1=ws.cell(row=i, column=2).value
print(name1)
mylines["Column_name"].append(name) #Appending dictionary key "Column_name"
mylines["Column_Type"].append(name1) #Appending dictionay key "Column_type"
for index, value in enumerate(mylines["Column_Type"]):
mylines["Column_Type"][index] = type_strs.get(value, value)
mylines["Column_name"].append('NextLine')
mylines["Column_Type"].append('NextLine')
theString = " "
NewFile = True
sheetList = wb.sheetnames
tabIndex = 1
for i in range(len(mylines['Column_name'])):
if(mylines['Column_name'][i] != 'NextLine'):
theString += mylines['Column_name'][i] + " " + mylines['Column_Type'][i]
theString += ", "
else:
theString = theString[:-2]
if NewFile:
NewFile = False
outputFile = open('output.txt', 'w') # Text file Output
print("New file ", theString)
else:
outputFile = open('output.txt', 'a')
print("Not new file ", theString)
outputFile.write("CREATE TABLE TRANSIENT TABLE STG_" + sheetList[tabIndex] +"({});".format(theString) + "\n")
outputFile.close()
tabIndex += 1
theString = " "

Delete timestamp in my output file in excel using python

I need to remove timestamps in my file. It should only return the name. My source text file looks like this
7:52:01 AM sherr
hello GOOD morning .
おはようございます。
7:52:09 AM sherr
Who ?
誰?
7:52:16 AM sherr
OK .
わかりました
and my code looks like this
from openpyxl import Workbook
import copy
wb = Workbook()
with open('chat_20220207131707.txt', encoding='utf-8') as sherr:
row = 1
column = 1
ws = wb.active
for line in sherr:
ws.cell(row=row, column=column, value=line.strip())
if (column := column + 1) > 3:
row += 1
column = 1
for row in ws.iter_rows():
for cell in row:
alignment = copy.copy(cell.alignment)
alignment.wrapText=True
cell.alignment = alignment
wb.save('sherrplease.xlsx')
If the file always has the same structure, which it certainly looks like, this can be done with simple splitting of the string in question.
from openpyxl import Workbook
import copy
wb = Workbook()
with open('chat_20220207131707.txt', encoding='utf-8') as sherr:
row = 1
column = 1
ws = wb.active
for line in sherr:
if column == 1:
## split the line and rejoin
value = " ".join(line.strip().split(' ')[2:])
else:
value = line.strip()
ws.cell(row=row, column=column, value=value)
if (column := column + 1) > 3:
row += 1
column = 1
for row in ws.iter_rows():
for cell in row:
alignment = copy.copy(cell.alignment)
alignment.wrapText=True
cell.alignment = alignment
wb.save('sherrplease.xlsx')

Is there a way to read and alter the contents of a huge csv file in PyCharm?

I'm attempting to create a program currently that can read a csv, determine if a substring is included in one of the columns of each row, and if it isn't present, rewrites certain columns to a new csv. I have the code down for this much- but the csv I need to use the program for has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data. It can only view the csv in a read-only format which doesn't allow me to use it. I know pandas has a chunk size feature but I don't know how to implement this with the rest of my code.
def reading(csv_input):
originalLength = 0
rowCount = 0
with open(f'Web Report {csv_input}', 'w') as file:
writer = csv.writer(file)
writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
dropCount = 0
data = pd.read_csv(csv_input, chunksize=100000)
df = pd.DataFrame(data,
columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
'URL Category', 'Action', 'Action Description'])
originalLength = len(df.index)
for line in range(originalLength):
dataLine = df.loc[line]
x = dataLine.get(key='Action')
if x == 0:
siteName = dataLine.get(key='Site Name')
if 'dbk' in siteName:
dropCount = dropCount + 1
elif 'ptc' in siteName:
dropCount = dropCount + 1
elif 'wcf' in siteName:
dropCount = dropCount + 1
elif 'google' in siteName:
dropCount = dropCount + 1
else:
writer.writerow([line, # Original Index
df.loc[line].get(key='URL Category'), # Original URL Category
df.loc[line].get(key='User IP'), # Original User IP
df.loc[line].get(key='Site Name')]) # Original Site Name
rowCount = rowCount + 1
else:
dropCount = dropCount + 1
file.close()
print("Input: " + str(csv_input))
print("Output: " + str(file.name))
print("Original Length: " + str(originalLength))
print("Current Length: " + str(rowCount))
print("Drop Count: " + str(dropCount) + "\n")
return df
If you use csv to write file then you could use it also to read row by row.
import csv
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader: # read row by row
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
If you want to use pandas with chunk then you should use for-loop for this.
And when you write with pandas then you need append mode without headers.
import pandas as pd
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create new file with headers
df.to_csv('output.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv
# --- create some data ---
data = {
'A': range(0,10),
'B': range(10,20),
'C': range(20,30),
} # columns
df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)
# --- read and write with `pandas` ---
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create empty with headers
df.to_csv('output_pandas.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output_pandas.csv', mode='a', header=False)
# --- read and write with `csv` ---
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader:
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
Doc: read_csv(), to_csv()

writing Excel file while using for loop

I am trying to write data to an Excel file, during a for loop.
But what I am getting is a single line containing the last data received by the loop.
I have tried a couple of different methods but came short..
2 tries are list below
Any Ideas ?
def write_excel(x):
workbook = xlsxwriter.Workbook('ID_Num.xlsx')
worksheet = workbook.add_worksheet()
df = pd.DataFrame(
{'ID':[x],
'mail_one':[Email],
'second_mail':[second_mail],
'Num':[Num],
'date':[Date]})
row_num = 0
for key, value in df.items():
worksheet.write(0, row_num, key)
worksheet.write_row(1, row_num, value)
row_num += 1
workbook.close()
#df = pd.DataFrame(
# {'ID':[x],
# 'mail_one':[Email],
# 'second_mail':[second_mail],
# 'Num':[Num],
# 'date':[Date]})
# writer = ExcelWriter('ID_Num.xlsx')
# df.to_excel(writer,'ID_Num',index=False)
# writer.save()
if __name__ == "__main__":
for x in List:
my_dic = {}
my_dict["ID"] = x
my_dict["mail_one"] = Email
my_dict["second_mail"] = second_mail
my_dict["Num"] = str(Num)
my_dict["date"] = Date
print(my_dict)
write_excel(x)
I don't have xlsxwriter so I cannot test. The documentation says that it cannot modify an existing file so I suspect that every iteration of for x in List: you are over-writing your file (workbook = xlsxwriter.Workbook('ID_Num.xlsx')).
You can make multiple files with these changes:
def write_excel(x,i):
workbook = xlsxwriter.Workbook(f'ID_Num{i}.xlsx')
...
# and
for i,x in enumerate(List):
...
write_excel(x,i)
Or you could accumulate multiple dictionaries and pass all of them to your function
data = []
for x in List:
my_dic = {}
...
data.append(my_dic)
write_excel(data)
Changing the function to iterate over those dicts; making a new sheet for each one
def write_excel(data):
workbook = xlsxwriter.Workbook('ID_Num.xlsx')
for sht in data:
worksheet = workbook.add_worksheet()
df = pd.DataFrame(...
row_num = 0
for key, value in df.items():
worksheet.write(...
worksheet.write_row(...
row_num += 1
workbook.close()

Using xlwings to open excel sheet. Need to search a string and print full line

I'm using xlwings to open excel sheet. Need to search a string in a specific column and print full line of the string without search item and until new line(\n). Output should be in new column of same sheet.
Input:
search string: [game]
Output:
import xlwings as xw
open excel file using xlwings
filename = r'input.xlsx'
book = xw.Book(filename)
sheet = book.sheets[0]
find the last row of the sheet on a specific range in this case from column 'A'
lrow = sheet.range('A' + str(sheet.cells.last_cell.row)).end('up').row
declare a separate variable for the string that you will search and the column where your output will be located.
search_string = '[game]'
sheet.range('B1').value = 'output'
output_index = 2
now loop through that range to see if your search_string is in that range
for i in range(1, lrow + 1):
if search_string in str(sheet.range('A{}'.format(i)).value):
temp = str(sheet.range('A{}'.format(i)).value)
temp = temp.split(search_string)[1]
if '[' in temp:
temp = temp.split('[')[0]
sheet.range('B{}'.format(output_index)).value = temp
output_index += 1
book.save()
book.close()
Below is the full code >>
import xlwings as xw
filename = r'input.xlsx'
book = xw.Book(filename)
sheet = book.sheets[0]
lrow = sheet.range('A' + str(sheet.cells.last_cell.row)).end('up').row
search_string = '[game]'
sheet.range('B1').value = 'output'
output_index = 2
for i in range(1, lrow + 1):
if search_string in str(sheet.range('A{}'.format(i)).value):
temp = str(sheet.range('A{}'.format(i)).value)
temp = temp.split(search_string)[1]
if '[' in temp:
temp = temp.split('[')[0]
sheet.range('B{}'.format(output_index)).value = temp
output_index += 1
book.save()
book.close()

Categories