Pandas: fetch lines of data into columns - Python

I have a problem with my script.
I've made a script that fetches data from the lines of a raw .txt file into columns in an Excel file.
It worked at the beginning, but now that I've added more data to the file it doesn't. If you can help me, or have another solution, I'd appreciate it.
This is my script:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os
import pandas as pd
import time


def clearcasevobs():
    pathdest = path_dir()
    dest = createdir()
    timestr = time.strftime("%Y-%m-%d")
    txtName = rf"{pathdest}\{timestr}-clearcaseRawData-vobsDetails.txt"
    workBook = xlwt.Workbook(encoding='ascii')
    workSheet = workBook.add_sheet('sheet1')
    fp = open(txtName, 'r+b')

    # header
    workSheet.write(0, 0, "Tag")
    workSheet.write(0, 1, "CreateDate")
    workSheet.write(0, 2, "Created By")
    workSheet.write(0, 3, "Storage Host Pathname")
    workSheet.write(0, 4, "Storage Global Pathname")
    workSheet.write(0, 5, "DB Schema Version")
    workSheet.write(0, 6, "Mod_by_rem_user")
    workSheet.write(0, 7, "Atomic Checkin")
    workSheet.write(0, 8, "Owner user")
    workSheet.write(0, 9, "Owner Group")
    workSheet.write(0, 10, "ACLs enabled")
    workSheet.write(0, 11, "FeatureLevel")

    row = 0
    entries = 0
    fullentry = []
    for linea in fp.readlines():
        str_linea = linea.decode('gb2312', 'ignore')
        str_linea = str_linea[:-2]  # str string
        txt = str_linea
        arr = str_linea
        if arr[:9] == "versioned":
            txt = arr
            entries += 1
            s = txt.index("/")
            e = txt.index('"', s)
            txt = txt[s:e]
            fullentry.append(txt)
        elif arr.find("created") >= 0:
            entries += 1
            txt = arr
            s = txt.index("created")
            e = txt.index("by")
            txt1 = txt[s + 7:20]
            fullentry.append(txt1)
            txt2 = txt[e + 3:]
            fullentry.append(txt2)
        elif arr.find("VOB storage host:pathname") >= 0:
            entries += 1
            txt = arr
            s = txt.index('"')
            e = txt.index('"', s + 1)
            txt = txt[s + 1:e]
            fullentry.append(txt)
        elif arr.find("VOB storage global pathname") >= 0:
            entries += 1
            txt = arr
            s = txt.index('"')
            e = txt.index('"', s + 1)
            txt = txt[s + 1:e]
            fullentry.append(txt)
        elif arr.find("database schema version:") >= 0:
            entries += 1
            txt = arr
            txt = txt[-2:]
            fullentry.append(txt)
        elif arr.find("modification by remote privileged user:") >= 0:
            entries += 1
            txt = arr
            s = txt.index(':')
            txt = txt[s + 2:]
            fullentry.append(txt)
        elif arr.find("tomic checkin:") >= 0:
            entries += 1
            txt = arr
            s = txt.index(':')
            txt = txt[s + 2:]
            fullentry.append(txt)
        elif arr.find("owner ") >= 0:
            entries += 1
            txt = arr
            s = txt.index('owner')
            txt = txt[s + 5:]
            fullentry.append(txt)
        elif arr.find("group tmn") >= 0:
            if arr.find("tmn/root") == -1:
                entries += 1
                txt = arr
                s = txt.index('group')
                entries += 1
                txt = txt[s + 5:]
                fullentry.append(txt)
        elif arr.find("ACLs enabled:") >= 0:
            entries += 1
            txt = arr
            txt = txt[-2:]
            fullentry.append(txt)
        elif arr.find("FeatureLevel =") >= 0:
            entries += 1
            txt = arr
            txt = txt[-1:]
            fullentry.append(txt)
        if row == 65536:
            break

    finalarr = []
    finalarr1 = []
    temp = 0
    row = 1
    for r in fullentry:
        finalarr.append(r)
        temp += 1
        if temp == 12:
            finalarr1.append(finalarr)
            temp = 0
            col = 0
            for arr in finalarr:
                workSheet.write(row, col, arr)
                col += 1
            row += 1
            finalarr.clear()
        if row == 65536:
            break

    workBook.save(os.path.join(dest, "ClearcaseReport.xls"))
    fp.close()
This is my file.txt data (the file the script needs in order to work, and doesn't):
https://paste.pythondiscord.com/sedowagigo
This is how the output should look as an Excel file:
Details:
- The script basically should read the data inside the .txt file, create the columns based on the keywords I set, and add the wanted data to the right columns, while ignoring the junk/raw data that doesn't need to be processed.
- The first time it was working, which is why I also have the .xls output photo, but now it isn't working anymore because I added more data inside and there is more junk... If someone can help me, or knows another method, I'm open to everything.
This is the old .txt file that I tested the script with, and it works: https://paste.pythondiscord.com/ohewatahuv
This is the error that I received when I used the script on the new file attached in the pastebin at the beginning (https://paste.pythondiscord.com/sedowagigo):
Thanks for the help!
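A more maintainable pattern for this kind of keyword-driven parsing is to map each keyword to its column, collect one dict per VOB entry, and let pandas build the sheet: junk lines simply match no keyword and are skipped, and missing fields become NaN instead of shifting everything into the wrong columns. A minimal sketch, assuming the same keywords and headers as the script above (the value-extraction slicing is illustrative and would need adjusting to the real file):

import pandas as pd

# keyword that identifies a line -> column it belongs to (a subset of the headers above)
KEYWORDS = {
    "versioned": "Tag",
    "VOB storage host:pathname": "Storage Host Pathname",
    "VOB storage global pathname": "Storage Global Pathname",
    "database schema version:": "DB Schema Version",
    "ACLs enabled:": "ACLs enabled",
    "FeatureLevel =": "FeatureLevel",
}

rows, current = [], {}
with open("vobsDetails.txt", encoding="utf-8", errors="ignore") as fp:
    for line in fp:
        line = line.strip()
        for keyword, column in KEYWORDS.items():
            if keyword in line:
                if column == "Tag" and current:
                    rows.append(current)  # a new "versioned ..." line starts a new entry
                    current = {}
                # illustrative: keep whatever follows the keyword, minus quotes/colons
                current[column] = line.split(keyword, 1)[1].strip(' ":')
                break  # junk lines match no keyword and are ignored
if current:
    rows.append(current)

df = pd.DataFrame(rows)  # missing keywords become NaN instead of shifting columns
df.to_excel("ClearcaseReport.xlsx", index=False)

Because each entry is a dict keyed by column name, extra junk in the input can only add ignored lines, not push values into the wrong columns.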

Related

How to iterate over the rows from 2 files, compare the values and update the value in a file when the condition is met?

For changing the values from 10 to 18, 19 or 20, I am splitting the string, accessing the substrings, and then trying to change them. It runs, but it just doesn't change the values. Here is the solution I am trying to implement:
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
readme = open(topoFileName, "r")
Lines = readme.readlines()
readme.close()

newFile = open(newFileName, "w")
for row in oldLines:
    for line in Lines:
        tmp = line.split()
        list = row.rstrip()
        tmp1 = list.split()
        newFile.write(row)
        if row.find("BEG_ELEM_DATA") > -1:
            if tmp[0] == tmp1[0]:
                if tmp[2] == 1 and tmp[3] == 0:
                    # it is magnet, value 18
                    newFile.write(tmp1.replace(tmp1[1], "18"))
                elif tmp[2] == 1 and tmp[3] == 1:
                    # it is iron, value 19
                    newFile.write(tmp1.replace(tmp1[1], "19"))
                else:
                    # it is air, value 20
                    newFile.write(tmp1.replace(tmp1[1], "20"))
newFile.close()
I would really appreciate it if you were able to solve this problem in the above script; then I guess it should work.
I'm also still a beginner in Python, but I tried to solve your problem, and here is my solution:
I guess there are far better ways to do it, because here you have to import all the data into a dataframe before comparing it.
Also, I don't know whether you can read your data into a dataframe with pd.read_csv, because I don't know the *.hmo and *.topo formats.
import pandas as pd

df = pd.read_csv('tryout.csv', delimiter=';')
df2 = pd.read_csv('density.csv', delimiter=';')

for idx, row in df.iterrows():
    for idx2, row2 in df2.iterrows():
        if row[0] == row2[0]:
            if row2[2] == 1 and row2[3] == 0:
                # it is magnet, value 18
                row[1] = 18
            elif row2[2] == 1 and row2[3] == 1:
                # it is iron, value 19
                row[1] = 19
            else:
                # it is air, value 20
                row[1] = 20

df.to_csv('new_tryout.csv')
What my code does here: it loads both files into dataframes, then iterates over every line to compare rows where the ID in both files is the same (e.g. 3749).
If they match, the three if statements decide whether it is magnet/iron/air and change the value in df to the right number.
At the end the new df is saved to a new file, 'new_tryout.csv'.
I created two test files for it, and it worked the way it should.
Finally, here is the solution you were searching for.
import pandas as pd

df2 = pd.read_csv('Density.topo', header=0, names=list('ABCD'), delimiter=r'\s+', skiprows=1)
df2[['C', 'D']] = df2[['C', 'D']].round()

new_file_content = ''
with open('tryout.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            search_idx = df2[df2['A'] == ID_to_search_for].index[0]
            if df2['C'][search_idx] == 1 and df2['D'][search_idx] == 0:
                change = '18'  # magnet
                new_line = line[:11] + change + line[13:]
            elif df2['C'][search_idx] == 1 and df2['D'][search_idx] == 1:
                change = '19'  # iron
                new_line = line[:11] + change + line[13:]
            else:
                change = '20'  # air
                new_line = line[:11] + change + line[13:]
            new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
If you don't want to use dataframes, you can do it like this:
with open('density.topo') as f:
    lists_of_list = [line.rstrip().split() for line in f]

new_file_content = ''
with open('tryout_test.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            for idx, sublist in enumerate(lists_of_list):
                if sublist[0] == ID_to_search_for:
                    if lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 0:
                        change = '18'  # magnet
                        new_line = line[:11] + change + line[13:]
                    elif lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 1:
                        change = '19'  # iron
                        new_line = line[:11] + change + line[13:]
                    else:
                        change = '20'  # air
                        new_line = line[:11] + change + line[13:]
                    new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
OK, here is my final answer. It does (again) all the things you were searching for. Please debug your code in your IDE if there is a problem. You should start using context managers instead of opening and closing files step by step.
I wrote the new code around the code from your question and added some comments to it.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
m = int(oldLines[3])
print(m)
new_m = m + 3
m1 = str(m)
new_m1 = str(new_m)

Phrase = "END_COMP_DATA"
#n = "Phrase not found" #not used --> not needed
with open(oldFileName, "r") as oldFile:
    for number, lin in enumerate(oldFile):
        if Phrase in lin:
            n = number

# insert 3 lines into tryout_new at the right position (--> row n)
magnet = f" {m+1} "'" topo_magnet"'"\n"
iron = f" {m+2} "'" topo_iron"'"\n"
air = f" {m+3} "'" topo_air"'"\n"
oldLines[n:n] = [magnet, iron, air]

newFile = open(newFileName, "w")
flag = 0

with open('density.topo') as f:
    data_density = [line.rstrip().split() for line in f]

for idx, row in enumerate(oldLines):
    lst = row.rstrip()  # I think you shouldn't name a variable like a class in python (list). use 'lst' or something like that
    tmp_tryout = lst.split()
    if row.find("BEG_ELEM_DATA") > -1:
        flag = 1
    if flag == 1 and len(tmp_tryout) > 1:
        # if the line has more than 2 columns (after split), check for the "10"
        if tmp_tryout[1] == '10':
            # density_idx_line searches density.topo for a match with tmp_tryout[0] (e.g. 3749) and stores the whole line
            density_idx_line = list(filter(lambda x: x[0] == tmp_tryout[0], data_density))
            if len(density_idx_line) > 0:
                if density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1e-05':
                    # the ' 10 ' is the 10 with a whitespace before and after it.
                    # Only like this does just the 10 get replaced (and not e.g. 3104 to 3184)
                    newFile.write(row.replace(' 10 ', ' 18 '))
                elif density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1.0':
                    newFile.write(row.replace(' 10 ', ' 19 '))
                else:
                    newFile.write(row.replace(' 10 ', ' 20 '))
            else:
                newFile.write(row)
    else:
        if idx == 3:
            newFile.write(row.replace(m1, new_m1))
        else:
            newFile.write(row)

newFile.close()
print("script terminated successfully!")
OK, here is another solution. For anybody else who reads this: it is still only a temporary solution, but #Sagar and I both don't know how to do it better.
import numpy as np
import pandas as pd

df = pd.read_csv('tryout.hmo', header=0, names=list('ABCDEFGHIJKLM'), delimiter=r'\s+', skiprows=[i for i in range(52362)])
df2 = pd.read_csv('Density.topo', header=0, names=list('ANOP'), delimiter=r'\s+', skiprows=1)
df2 = df2.iloc[:-3, :]

df3 = df.merge(df2, how='outer', on='A')
df3[['O', 'P']] = df3[['O', 'P']].fillna(-1).astype(int).replace(-1, np.nan)
df3['B'] = df3.apply(lambda x: 18 if x['B'] == 10 and x['O'] == 1 and x['P'] == 0 else (
                     19 if x['B'] == 10 and x['O'] == 1 and x['P'] == 1 else (
                     20 if x['B'] == 10 and x['O'] == 0 and x['P'] == 0 else x['B'])), axis=1)
df3.to_csv('new_tryout.csv')
It finishes in less than a second, so it is far better than iterrows or itertuples.
The new csv file includes both the tryout file and the density file, merged together on the first column of the tryout file (the ID, I guess).
I didn't check all of this very big file, but from the few random points I checked, it seems this way works.
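If the row-wise apply ever becomes the bottleneck, the same three-way mapping can be written in vectorized form with numpy.select; a sketch that continues from the df3 built above:

import numpy as np

# one boolean mask per material, in the same order as the apply above: magnet, iron, air
conditions = [
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 0),
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 1),
    (df3['B'] == 10) & (df3['O'] == 0) & (df3['P'] == 0),
]
df3['B'] = np.select(conditions, [18, 19, 20], default=df3['B'])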

OverflowError: Python int too large to convert to C long with openpyxl.load_workbook(filename, keep_vba=True)

for clid in ClientID:
    filename = path_xlsx.replace('/', '\\') + '\\' + str(ClientID[clid])
    # print(filename)
    for i in range(len(csv)):
        if str(csv.loc[i, 'ClientID']) == clid:
            book = openpyxl.load_workbook(filename, keep_vba=True)
I get the error below (OverflowError: Python int too large to convert to C long):
The Excel file name is 12149.xlsm.
Another, similar Excel file with some modifications inside works; only this one won't open.
For example, with only this code:
import openpyxl

global book
filename = 'C:/Users/victor/12149.xlsm'
book = openpyxl.load_workbook(filename, keep_vba=True)
I get the same error; the traceback points into this openpyxl date-conversion helper:
def from_excel(value, offset=CALENDAR_WINDOWS_1900):
    if value is None:
        return
    if 1 < value < 60 and offset == CALENDAR_WINDOWS_1900:
        value += 1
    parts = list(jd2gcal(MJD_0, value + offset - MJD_0))
    _, fraction = divmod(value, 1)
    jumped = (parts[-1] == 0 and fraction > 0)
    diff = datetime.timedelta(days=fraction)
    if 0 < abs(value) < 1:
        return days_to_time(diff)
    if not jumped:
        return datetime.datetime(*parts[:3]) + diff
    else:
        return datetime.datetime(*parts[:3] + [0])
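The traceback location suggests one cell holds a numeric value far too large for the date conversion that from_excel applies while the workbook loads. One way to hunt for the offending value without openpyxl (which crashes before the workbook becomes usable) is to read the raw sheet XML straight out of the .xlsm archive; a minimal diagnostic sketch (the path and the size threshold are illustrative):

import re
import zipfile

# an .xlsm file is a zip archive; worksheet cell values live in <v> tags of the sheet XML
with zipfile.ZipFile('C:/Users/victor/12149.xlsm') as z:
    for name in z.namelist():
        if name.startswith('xl/worksheets/') and name.endswith('.xml'):
            xml = z.read(name).decode('utf-8', errors='ignore')
            for value in re.findall(r'<v>([-0-9.eE+]+)</v>', xml):
                try:
                    number = float(value)
                except ValueError:
                    continue
                if abs(number) > 2**31:  # far outside any plausible date serial
                    print(name, value)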

Openpyxl won't save file

For some reason openpyxl won't save the xlsx file at the end of the program.
I am trying to read measurements from a file, each line being a different measurement, and write them to Excel to make using the data later easier. Everything seems to work, but at the end the data isn't saved; if I point the save at a new file where the changes should go, it is not created.
from openpyxl import load_workbook
from openpyxl import Workbook

wb = load_workbook(filename='Data_Base.xlsx')
sheet = wb.worksheets[0]

BS = []
Signal = []

with open('WifiData2.txt') as f:
    for line in f:
        y = int(line.split('|')[0].split(';')[3])
        x = int(line.split('|')[0].split(';')[2])
        floor = int(line.split('|')[0].split(';')[1])
        data = line.split("|")[1].strip()
        measurements = data.split(";")
        for l in measurements:
            raw = l.split(" ")
            BSSID = raw[0]
            signal_strength = raw[1]
            print(signal_strength)
            BS.append(BSSID)
            Signal.append(signal_strength)
        nr = 0  # counters initialized here (missing in the original snippet)
        q = 0
        for row_num in range(sheet.max_row):
            num = row_num
            if row_num > 1:
                test_X = int(sheet.cell(row=row_num, column=4).value)
                test_Y = int(sheet.cell(row=row_num, column=3).value)
                test_floor = int(sheet.cell(row=row_num, column=2).value)
                if test_X == x and test_Y == y and test_floor == floor:
                    nr = nr + 1
                    if nr > 3:
                        q = 1
        if q == 0:
            sheet.cell(row=sheet.max_row + 1, column=2, value=floor)
            sheet.cell(row=sheet.max_row + 1, column=3, value=x)
            sheet.cell(row=sheet.max_row + 1, column=4, value=y)
            sheet.cell(row=sheet.max_row + 1, column=2, value=sheet.max_row)
            for element in range(len(BS)):  # index-based, so BS[element]/Signal[element] below work
                nr = 0
                for col in range(1, sheet.max_column + 1):
                    if BS[element] == sheet.cell(row=1, column=col).value:
                        sheet.cell(row=sheet.max_row + 1, column=col, value=Signal[element])
                        nr = 1
                if nr == 0:
                    sheet.cell(row=1, column=sheet.max_column + 1, value=BS[element])
                    sheet.cell(row=sheet.max_row + 1, column=sheet.max_column + 1, value=BS[element])
        Signal.clear()
        BS.clear()

wb.save('Data_Base1.xlsx')
What is weird is that if I save the workbook earlier, the file does get created. Of course that doesn't really help me, since any changes I want made won't be there. I had a similar issue when I tried it with the xlrd/xlwt/xlutils combo. Does anyone know where the problem is?
Using an absolute path instead of a relative path will do the trick!
Add
wb.template = False
before
wb.save('Filename.xlsx')
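Combining both suggestions, the end of the script would look something like this (a sketch, using the filename from the question):

import os

wb.template = False                          # ensure the workbook is saved as a regular file, not a template
wb.save(os.path.abspath('Data_Base1.xlsx'))  # absolute path, so it can't land in an unexpected working directory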

Not writing into an Excel file

I'm reading the data from one file named SPD_file and matching it against another file named Custom; all the records that match in both files should be written into a third file.
But it seems that something is wrong, because the code matches the records and prints them on the console, yet when I write to the new file, nothing ends up in it other than the header.
workbook = xlrd.open_workbook(SPD_file)
worksheets = workbook.sheet_names()
mapping_records = {}
for worksheet_name in worksheets:
    worksheet = workbook.sheet_by_name(worksheet_name)
    mapping_record = MappingRecord()
    if worksheet_name == "CD":
        for curr_row in range(0, worksheet.nrows):
            mapping_record = worksheet.row(curr_row)
            print worksheet_name
            print mapping_record[0].value
            for curr_row in mapping_record:
                #print "In Loop...."
                spd_record = MappingRecord()
                spd_record.id = "00002269"
                spd_record.erocode = None
                spd_record.scno = None
                mapping_records[mapping_record[8]] = spd_record
print "Read SPD File....."

custom_file_name = "Custom_" + today.strftime('%Y-%m-%d') + ".csv"
custom_file = ops_home + path + "\\" + custom_file_name
custom = open(custom_file, 'rb')
reader = csv.reader(custom, delimiter=',', quotechar='"')
for line in reader:
    if mapping_records.has_key(mapping_record[8]):
        spd_record = mapping_records[mapping_record[8]]
        if line[7] == "ERO Code":
            spd_record.erocode = line[8]
        elif line[7] == "Service Number":
            spd_record.scno = line[8]

#create a new file.
New_file = ops_home + '\\Reports\\SPD_new_' + today.strftime('%d%m%Y') + '.xlsx'
workbook = xlsxwriter.Workbook(New_file)

# Add a bold format to use to highlight cells.
bold = workbook.add_format({'bold': 1})
money = workbook.add_format({'num_format': '#,##0.00'})

worksheetCd = workbook.add_worksheet("CD")
cdHeader = ("Merchant ID", "EroCode", "Service Number")

cd_row = 0
cd_col = 0
for columnHeader in cdHeader:
    worksheetCd.write(cd_row, cd_col, columnHeader, bold)
    cd_col += 1

for ctx in mapping_records:
    spd_record = mapping_records[ctx]
    if spd_record.payment_mode == "CRD":
        cd_row += 1
        cd_col = 0
        cdRow = (spd_record.id, spd_record.erocode, spd_record.scno)
        for columnData in cdRow:
            if cd_col == 5 or cd_col == 19 or cd_col == 20 or cd_col == 21:
                worksheetCd.write_number(cd_row, cd_col, columnData, money)
            else:
                worksheetCd.write(cd_row, cd_col, columnData)
            cd_col += 1

workbook.close()
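One thing that stands out in the CSV loop above: every iteration tests mapping_records.has_key(mapping_record[8]), where mapping_record is whatever was left over from the xlrd loop, so the current CSV line never drives the lookup. If the matching key also appears in each CSV row, the loop would need to key on that row, something like the sketch below (which CSV column holds the key is a guess; adjust the index to the real layout):

for line in reader:
    key = line[0]  # hypothetical: whichever CSV column carries the same ID used as the dict key
    if key in mapping_records:  # 'in' instead of has_key(); works on Python 2 and 3
        spd_record = mapping_records[key]
        if line[7] == "ERO Code":
            spd_record.erocode = line[8]
        elif line[7] == "Service Number":
            spd_record.scno = line[8]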

Parsing a big text file, extract data & store it in a CSV file.. Too Slow

I have a big log file (say 1-3 GB) which I need to parse, extract data from, and save in a CSV file.
Text File Data
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
* D:40027C5C rd-byte 00 *core0\Global\Ypf_OILL_OilLvlOn 20.342us
* D:40010044 rd-word 0FE2 *l\u2SAD_OILLVS_RecoveryCounter 0.160us
* D:40010044 wr-word 0FE1 *l\u2SAD_OILLVS_RecoveryCounter 0.040us
* D:40035FC8 wr-long 00000008 \\core0\Global\u4TimeHiCnt 1.000us
I have to extract the variable name, which comes after the last \, and then the number of reads & writes along with the datatype, and store it in a CSV file.
CSV File Result
Variable           Datatype   CORE 0           CORE 1           CORE X
                              Read     Write   Read     Write   Read   Write
OS_inKernel        byte       0        0       111768   111878  0      0
OS_globalIntLevel  long       0        0       281604   237901  0      0
The problem is that it takes too much time. Can you please look into the attached code and suggest ways to make it faster?
import string
import sys
import time

MyFile = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
GeneratedFile = open(str(("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\") + 'ParsedOutput.csv'), 'w')

try:
    MyVariableList = []
    TimeStartTest = time.time()  # Starting Time
    GeneratedFile.write('\nVariable')
    GeneratedFile.write(', Datatype')
    GeneratedFile.write(', CORE 0')
    GeneratedFile.write(',, CORE 1')
    GeneratedFile.write(',, CORE X')
    GeneratedFile.write('\n,, Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write(', Read ')
    GeneratedFile.write(', Write ')
    GeneratedFile.write('\n')

    for CurrentLine in MyFile:
        NoofSpaces = 0
        if CurrentLine.find('\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('\\') + 1:].split(' ')[0]
        elif CurrentLine.find('*\\') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*\\') + 1:].split(' ')[0]
        elif CurrentLine.find('*') != -1:
            MyVariable = CurrentLine[CurrentLine.rfind('*') + 1:].split(' ')[0]

        VariableFound = 0
        MyVariableList.sort()
        Lowerbound = 0
        Upperbound = len(MyVariableList) - 1
        while Lowerbound <= Upperbound and VariableFound == 0:
            middle_pos = (Lowerbound + Upperbound) // 2
            if MyVariableList[middle_pos] < MyVariable:
                Lowerbound = middle_pos + 1
            elif MyVariableList[middle_pos] > MyVariable:
                Upperbound = middle_pos - 1
            else:
                VariableFound = 1

        if VariableFound == 0:
            MyVariableList.append(MyVariable)
            try:
                MyFile1 = open("C:\\Users\\AEC_FULL\\Saravanan\\Workspace\\Trace32Log_Parser\\core1_sram_ReadWrite.txt")  # core0_sram_ReadWrite_rawdata
                Core0_ReadCount = 0
                Core0_WriteCount = 0
                Core1_ReadCount = 0
                Core1_WriteCount = 0
                CoreX_ReadCount = 0
                CoreX_WriteCount = 0
                for CurrentLine1 in MyFile1:
                    if CurrentLine1.find(MyVariable) != -1:
                        ## CORE 0 ##
                        if CurrentLine1.find("0\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core0_ReadCount = Core0_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core0_WriteCount = Core0_WriteCount + 1
                        ## CORE 1 ##
                        elif CurrentLine1.find("1\\Global") != -1:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                Core1_ReadCount = Core1_ReadCount + 1
                            elif DataOperation == 'wr':
                                Core1_WriteCount = Core1_WriteCount + 1
                        ## CORE X ##
                        else:
                            DataType = CurrentLine1.split(' ')[0].split('-')[1]
                            DataOperation = CurrentLine1.split(' ')[0].split('-')[0].split(' ')[-1]
                            if DataOperation == 'rd':
                                CoreX_ReadCount = CoreX_ReadCount + 1
                            elif DataOperation == 'wr':
                                CoreX_WriteCount = CoreX_WriteCount + 1
                GeneratedFile.write('\n %s' % MyVariable)
                GeneratedFile.write(', %s' % DataType)
                GeneratedFile.write(', %d' % Core0_ReadCount)
                GeneratedFile.write(', %d' % Core0_WriteCount)
                GeneratedFile.write(', %d' % Core1_ReadCount)
                GeneratedFile.write(', %d' % Core1_WriteCount)
                GeneratedFile.write(', %d' % CoreX_ReadCount)
                GeneratedFile.write(', %d' % CoreX_WriteCount)
                GeneratedFile.write('\n')
            finally:
                MyFile1.close()
except:
    print sys.exc_info()
finally:
    GeneratedFile.close()
    MyFile.close()
    TimeStopTest = time.time()
    print str(int((TimeStopTest - TimeStartTest) / 60))
You'd better use the with statement, like this:

# if this file is line based
with open('test.txt') as f:
    for line in f:
        # process line, do something with line
        pass
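The with statement helps, but the real cost is that the script re-reads the entire log once for every new variable, which is quadratic in the file size. A single pass that counts reads and writes per variable in a dictionary should be dramatically faster; a minimal sketch, assuming the line format from the sample above (the regex and the core0/core1/coreX detection are assumptions based on those five lines):

import csv
import re
from collections import defaultdict

# counts[(variable, datatype)] -> {'core0_rd': n, 'core0_wr': n, ...}
counts = defaultdict(lambda: defaultdict(int))
line_re = re.compile(r'(rd|wr)-(\w+)\s+\S+\s+(\S+)')

with open('core1_sram_ReadWrite.txt') as f:
    for line in f:
        m = line_re.search(line)
        if not m:
            continue  # skip lines that don't look like memory accesses
        op, dtype, path = m.groups()
        variable = path.rsplit('\\', 1)[-1]  # name after the last backslash
        if '0\\Global' in path:
            core = 'core0'
        elif '1\\Global' in path:
            core = 'core1'
        else:
            core = 'coreX'
        counts[(variable, dtype)]['%s_%s' % (core, op)] += 1

with open('ParsedOutput.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['Variable', 'Datatype',
                     'CORE0 Read', 'CORE0 Write',
                     'CORE1 Read', 'CORE1 Write',
                     'COREX Read', 'COREX Write'])
    for (variable, dtype), c in sorted(counts.items()):
        writer.writerow([variable, dtype,
                         c['core0_rd'], c['core0_wr'],
                         c['core1_rd'], c['core1_wr'],
                         c['coreX_rd'], c['coreX_wr']])

This reads the log exactly once regardless of how many distinct variables it contains, so a 1-3 GB file should take minutes instead of hours.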
