I'm reading the data from one file named SPD_file. Matching the data with another file named Custom. And all the records which are matching in both the files will be written into the third file.
But it seems that something is wrong, because the code is matching the records and printing on console. But when I'm writing into another file nothing is coming into the new file, other than the header.
workbook = xlrd.open_workbook(SPD_file)
worksheets = workbook.sheet_names()
mapping_records = {}
for worksheet_name in worksheets:
worksheet = workbook.sheet_by_name(worksheet_name)
mapping_record = MappingRecord()
if worksheet_name == "CD":
for curr_row in range(0,worksheet.nrows):
mapping_record = worksheet.row(curr_row)
print worksheet_name
print mapping_record[0].value
for curr_row in mapping_record:
#print "In Loop...."
spd_record = MappingRecord()
spd_record.id = "00002269"
spd_record.erocode = None
spd_record.scno = None
mapping_records[mapping_record[8]] = spd_record
print "Read SPD File....."
custom_file_name = "Custom_" + today.strftime('%Y-%m-%d') + ".csv"
custom_file = ops_home + path + "\\" + custom_file_name
custom = open(custom_file, 'rb')
reader = csv.reader(custom, delimiter=',', quotechar='"')
for line in reader:
if mapping_records.has_key(mapping_record[8]):
spd_record = mapping_records[mapping_record[8]]
if line[7] == "ERO Code":
spd_record.erocode = line[8]
elif line[7] == "Service Number":
spd_record.scno = line[8]
#create a new file.
New_file = ops_home + '\\Reports\\SPD_new_' + today.strftime('%d%m%Y') + '.xlsx'
workbook = xlsxwriter.Workbook(New_file)
# Add a bold format to use to highlight cells.
bold = workbook.add_format({'bold': 1})
money = workbook.add_format({'num_format': '#,##0.00'})
worksheetCd = workbook.add_worksheet("CD")
cdHeader = ("Merchant ID", "EroCode", "Service Number")
cd_row = 0
cd_col = 0
for columnHeader in cdHeader:
worksheetCd.write(cd_row, cd_col, columnHeader,bold)
cd_col += 1
for ctx in mapping_records:
spd_record = mapping_records[ctx]
if spd_record.payment_mode == "CRD":
cd_row += 1
cd_col = 0
cdRow = (spd_record.id, spd_record.erocode, spd_record.scno)
for columnData in cdRow:
if cd_col == 5 or cd_col == 19 or cd_col ==20 or cd_col ==21:
worksheetCd.write_number(cd_row, cd_col, columnData, money)
else:
worksheetCd.write(cd_row, cd_col, columnData)
cd_col += 1
workbook.close()
Related
i have a problem with my script.
I`ve made a script that fetch some datas from lines of a raw.txt file into columns to excel.
Its worked at the beggining but now when i added more datas in the file its not, if you can help me or have another solve.
**This is my script:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os
import pandas as pd
import time
def clearcasevobs():
pathdest = path_dir()
dest = createdir()
timestr = time.strftime("%Y-%m-%d")
txtName = rf"{pathdest}\{timestr}-clearcaseRawData-vobsDetails.txt"
workBook = xlwt.Workbook(encoding='ascii')
workSheet = workBook.add_sheet('sheet1')
fp = open(txtName, 'r+b')
# header
workSheet.write(0, 0, "Tag")
workSheet.write(0, 1, "CreateDate")
workSheet.write(0, 2, "Created By")
workSheet.write(0, 3, "Storage Host Pathname")
workSheet.write(0, 4, "Storage Global Pathname")
workSheet.write(0, 5, "DB Schema Version")
workSheet.write(0, 6, "Mod_by_rem_user")
workSheet.write(0, 7, "Atomic Checkin")
workSheet.write(0, 8, "Owner user")
workSheet.write(0, 9, "Owner Group")
workSheet.write(0, 10, "ACLs enabled")
workSheet.write(0, 11, "FeatureLevel")
row = 0
entries = 0
fullentry = []
for linea in fp.readlines():
str_linea = linea.decode('gb2312', 'ignore')
str_linea = str_linea[:-2] # str string
txt = str_linea
arr = str_linea
if arr[:9] == "versioned":
txt = arr
entries += 1
s = txt.index("/")
e = txt.index('"', s)
txt = txt[s:e]
fullentry.append(txt)
elif arr.find("created") >= 0:
entries += 1
txt = arr
s = txt.index("created")
e = txt.index("by")
txt1 = txt[s + 7:20]
fullentry.append(txt1)
txt2 = txt[e + 3:]
fullentry.append(txt2)
elif arr.find("VOB storage host:pathname") >= 0:
entries += 1
txt = arr
s = txt.index('"')
e = txt.index('"', s + 1)
txt = txt[s + 1:e]
fullentry.append(txt)
elif arr.find("VOB storage global pathname") >= 0:
entries += 1
txt = arr
s = txt.index('"')
e = txt.index('"', s + 1)
txt = txt[s + 1:e]
fullentry.append(txt)
elif arr.find("database schema version:") >= 0:
entries += 1
txt = arr
txt = txt[-2:]
fullentry.append(txt)
elif arr.find("modification by remote privileged user:") >= 0:
entries += 1
txt = arr
s = txt.index(':')
txt = txt[s + 2:]
fullentry.append(txt)
elif arr.find("tomic checkin:") >= 0:
entries += 1
txt = arr
s = txt.index(':')
txt = txt[s + 2:]
fullentry.append(txt)
elif arr.find("owner ") >= 0:
entries += 1
txt = arr
s = txt.index('owner')
txt = txt[s + 5:]
fullentry.append(txt)
elif arr.find("group tmn") >= 0:
if arr.find("tmn/root") == -1:
entries += 1
txt = arr
s = txt.index('group')
entries += 1
txt = txt[s + 5:]
fullentry.append(txt)
elif arr.find("ACLs enabled:") >= 0:
entries += 1
txt = arr
txt = txt[-2:]
fullentry.append(txt)
elif arr.find("FeatureLevel =") >= 0:
entries += 1
txt = arr
txt = txt[-1:]
fullentry.append(txt)
if (row == 65536):
break;
finalarr = []
finalarr1 = []
temp = 0
row = 1
for r in fullentry:
finalarr.append(r)
temp += 1
if temp == 12:
finalarr1.append(finalarr)
temp = 0
col = 0
for arr in finalarr:
workSheet.write(row, col, arr)
col += 1
row += 1
finalarr.clear()
if (row == 65536):
break;
workBook.save(os.path.join(dest, "ClearcaseReport.xls"))
fp.close()
This is my file.txt datas(the file that script need to work and doesnt):
https://paste.pythondiscord.com/sedowagigo
This is how should output as excel file::
Details:
-The script that i did basically should read the datas inside the .txt file and based on the keywords that i put to create the columns and add the wanted datas in the right columns, but also should ignore the sh*t/raw datas dat dont need to be processed.
-First time was working thats why i have also the output photo .xls, but now its not working anymore because i added more datas inside and have more junkies... If someone can help me or you know other method im open to all
This is the old .txt file that i tested the script and works: https://paste.pythondiscord.com/ohewatahuv
This is the error that i received when i use the script on the new file that i attach inside the pastebin at the beggining ( https://paste.pythondiscord.com/sedowagigo ):
Ty for help!
{
In excel sheet I have Expected values and actual values columns
I want to split numerical and character values(character values are units like ps, ns..etc I need to append on one column) and I want to do expected - actual values and add in new column and units in one column
}
My excel and output should be D and e column. We need to do Column B- Column C and units need to add in E
A B C D E
Scenario Expected Actual Deviation Units
AOP 102ps 100ps 2 ps
COD 113GBd 110GBd 3 GBd
EFG 99Gbps 98Gbps 1 Gbps
my code
import openpyxl
import re
s = '10km'
k=re.findall(r'[A-Za-z]+|\d+', s)
print(k)
You can use these two functions to get the text out of your input.
For the real numbers (float):
def get_real_number(input_string):
string_length = len(str(input_string))
temp_char_list = []
output_string = ''
for i in range(0, string_length):
if not str(input_string[i]).isdigit():
temp_char_list.append(input_string[i])
if "." in temp_char_list:
temp_char_list.remove(".")
if len(temp_char_list) > 0 and temp_char_list[0] == "-":
temp_char_list.remove("-")
if "e" in temp_char_list and "-" in temp_char_list:
temp_char_list.remove("e")
temp_char_list.remove("-")
output_string = input_string
for i in range(0, string_length):
if input_string[i] in temp_char_list:
output_string = output_string.replace(input_string[i], "")
return float(output_string)
For the Unit of Measure:
def get_uom(input_string):
string_length = len(str(input_string))
output_string = ''
for i in range(0, string_length):
if not str(input_string[i]).isdigit():
output_string = str(output_string) + str(input_string[i])
output_string = output_string.replace(".", "")
if "e-" in output_string:
output_string = output_string.replace("e-", "")
output_string = output_string.replace("-", "")
return output_string
To make use of the two functions and openpyxl to do what you want, put the two functions in the same py file or import the py file you like to store them in, then use the following code.
import openpyxl
from openpyxl.styles import Alignment
def main():
path = 'c:/temp/Pam - Copy.xlsx'
wb = openpyxl.load_workbook(path, data_only=True)
ws = wb['Sheet1']
end_range = len(ws['A'])
cell_value_b_filter = list(("----------------", "Disabled", "Diable", "*", "---", "Null", "Expected"))
cell_value_c_filter = list(("----------------", "Disabled", "Diasable", "---", "Null", "Actual"))
for curr_row in range(2, end_range + 1):
cell_value_a = str(ws.cell(row=curr_row, column=1).value)
cell_value_b = str(ws.cell(row=curr_row, column=2).value)
cell_value_c = str(ws.cell(row=curr_row, column=3).value)
if cell_value_a == "Output File Name":
continue
if cell_value_b in cell_value_b_filter or cell_value_c in cell_value_c_filter:
continue
if len(cell_value_b) == 0 or len(cell_value_c) == 0:
continue
if get_uom(cell_value_b) != get_uom(cell_value_c):
continue
cell_value_d = float(get_real_number(cell_value_b)) - float(get_real_number(cell_value_c))
cell_value_e = get_uom(cell_value_b)
ws.cell(row=curr_row, column=4).value = cell_value_d
ws.cell(row=curr_row, column=4).alignment = Alignment(horizontal='center')
ws.cell(row=curr_row, column=5).value = cell_value_e
wb.save(path)
wb.close()
If you are not family with python, you can stack the 3 functions all together like this.
import openpyxl
from openpyxl.styles import Alignment
def get_real_number(input_string):
string_length = len(str(input_string))
temp_char_list = []
output_string = ''
for i in range(0, string_length):
if not str(input_string[i]).isdigit():
temp_char_list.append(input_string[i])
if "." in temp_char_list:
temp_char_list.remove(".")
if len(temp_char_list) > 0 and temp_char_list[0] == "-":
temp_char_list.remove("-")
if "e" in temp_char_list and "-" in temp_char_list:
temp_char_list.remove("e")
temp_char_list.remove("-")
output_string = input_string
for i in range(0, string_length):
if input_string[i] in temp_char_list:
output_string = output_string.replace(input_string[i], "")
return float(output_string)
def get_uom(input_string):
string_length = len(str(input_string))
output_string = ''
for i in range(0, string_length):
if not str(input_string[i]).isdigit():
output_string = str(output_string) + str(input_string[i])
output_string = output_string.replace(".", "")
if "e-" in output_string:
output_string = output_string.replace("e-", "")
output_string = output_string.replace("-", "")
return output_string
def main():
path = 'c:/temp/Pam - Copy.xlsx'
wb = openpyxl.load_workbook(path, data_only=True)
ws = wb['Sheet1']
end_range = len(ws['A'])
cell_value_b_filter = list(("----------------", "Disabled", "Diable", "*", "---", "Null", "Expected"))
cell_value_c_filter = list(("----------------", "Disabled", "Diasable", "---", "Null", "Actual"))
for curr_row in range(2, end_range + 1):
cell_value_a = str(ws.cell(row=curr_row, column=1).value)
cell_value_b = str(ws.cell(row=curr_row, column=2).value)
cell_value_c = str(ws.cell(row=curr_row, column=3).value)
if cell_value_a == "Output File Name":
continue
if cell_value_b in cell_value_b_filter or cell_value_c in cell_value_c_filter:
continue
if len(cell_value_b) == 0 or len(cell_value_c) == 0:
continue
if get_uom(cell_value_b) != get_uom(cell_value_c):
continue
cell_value_d = float(get_real_number(cell_value_b)) - float(get_real_number(cell_value_c))
cell_value_e = get_uom(cell_value_b)
ws.cell(row=curr_row, column=4).value = cell_value_d
ws.cell(row=curr_row, column=4).alignment = Alignment(horizontal='center')
ws.cell(row=curr_row, column=5).value = cell_value_e
wb.save(path)
wb.close()
if __name__ == "__main__":
main()
I have a code to create a CSV with information from another CSV file. In my new CSV file, I would like to save only 20 rows sorted from highest to lowest of row ['impressions']
I read something about pandas but I don't find anything about how to do it!
To be more clear, I shared some images:
before:
enter image description here
after:
enter image description here
Code:
import csv
input_file = 'report_2017_12_11_12_31_19UTC.csv'
output_file= "All_Data_Tags.csv"
with open(input_file) as csvfile, open(output_file, "w") as output:
reader = csv.DictReader(csvfile)
cols = ("domain","ddomain","opportunities", "impressions", "fillRate", "DATA")
writer = csv.DictWriter(output, fieldnames=cols, extrasaction='ignore')
writer.writeheader()
for row in reader:
row['fillRate'] = '{:.2f}'.format(float(row['fillRate']) * 100)
if row['ddomain'] == "":
if row['domain'] == "":
row['ddomain'] = "App"
row['domain'] = " "
if row['domain'] == row['ddomain']:
row['domain'] = "Real Site"
if row['domain'] == "":
row['domain'] = "Detected Only"
if row['ddomain'] == "":
row['ddomain'] = "Vast Media"
if row['ddomain'] != row['domain']:
if row['ddomain'] != "Vast Media":
if row['domain'] != "Real Site":
if row['domain'] != "Detected Only":
if row['ddomain'] != "App":
row['DATA'] = "FAKE"
else:
row['DATA'] = "OK"
else:
row['DATA'] = "OK"
else:
row['DATA'] = "OK"
else:
row['DATA'] = "OK"
writer.writerow(row)
Here is the Answer:
code:
import pandas as pd
movies = pd.read_csv('Top20_Media_Yesterday.csv')
movies = movies.sort_values(['impressions'], ascending=False)
movies = movies.to_csv("Top20_Media_Yesterday.csv")
movies = pd.read_csv('Top20_Media_Yesterday.csv', nrows=21)
movies = movies.to_csv("Top20_Media_Yesterday.csv")
Use the DataFrame.sort_values function of the pandas framework, passing the column name(s),you wish to sort, to the by argument and setting axis to 1.
You can find similar examples here.
I'm running a piece of freely available python code used to detect CNVs in single cell sequencing data:
#!/usr/bin/env python
import sys
def main():
infilename = sys.argv[1]
outfilename = sys.argv[2]
statfilename = sys.argv[3]
chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
bins = ("/path/hg19.bin.boundaries.50k.bowtie.k50.sorted.txt", 0)
INFILE = open(infilename, "r")
OUTFILE = open(outfilename, "w")
STATFILE = open(statfilename, "w")
binCounts = []
for i in range(len(bins)):
binCounts.append(0)
print len(binCounts)
print len(bins)
counter = 0
totalReads = 0
prevChrompos = ""
for x in INFILE:
arow = x.rstrip().split("\t")
thisChrom = arow[2]
thisChrompos = arow[3]
if thisChrom.find("_") > -1:
#print thisChrom
continue
if thisChrom == "chrM":
#print thisChrom
continue
if thisChrom == "":
continue
if chrominfo.has_key(thisChrom):
pass
else:
continue
totalReads += 1
thisChrominfo = chrominfo[thisChrom]
thisAbspos = long(thisChrompos) + long(thisChrominfo[2])
counter += 1
indexUp = len(bins) - 1
indexDown = 0
indexMid = int((indexUp - indexDown) / 2.0)
while True:
if thisAbspos >= long(bins[indexMid][2]):
indexDown = indexMid + 0
indexMid = int((indexUp - indexDown) / 2.0) + indexMid
else:
indexUp = indexMid + 0
indexMid = int((indexUp - indexDown) / 2.0) + indexDown
if indexUp - indexDown < 2:
break
binCounts[indexDown] += 1
prevChrompos = thisChrompos
for i in range(len(binCounts)):
thisRatio = float(binCounts[i]) / (float(counter) / float(len(bins)))
OUTFILE.write("\t".join(bins[i][0:3]))
OUTFILE.write("\t")
OUTFILE.write(str(binCounts[i]))
OUTFILE.write("\t")
OUTFILE.write(str(thisRatio))
OUTFILE.write("\n")
binCounts.sort()
STATFILE.write("TotalReads\tMedianBinCount\n")
STATFILE.write(str(totalReads))
STATFILE.write("\t")
STATFILE.write(str(binCounts[len(bins)/2]))
STATFILE.write("\n")
INFILE.close()
OUTFILE.close()
STATFILE.close()
def fileToDictionary(inputFile, indexColumn):
input = open(inputFile, "r")
rd = dict()
# input.readline()
for x in input:
arow = x.rstrip().split("\t")
id = arow[indexColumn]
if rd.has_key(id):
#rd[id].append(arow)
print "duplicate knowngene id = " + id
print "arow = " + str(arow)
print "rd[id] = " + str(rd[id])
else:
rd[id] = arow
input.close()
return(rd)
def fileToArray(inputFile, skipFirst):
input = open(inputFile, "r")
ra = []
for i in range(skipFirst):
input.readline()
for x in input:
arow = x.rstrip().split("\t")
ra.append(arow)
input.close()
return(ra)
if __name__ == "__main__":
main()
I'm getting an error on line 40:
Traceback (most recent call last):
File "/path/varbin.50k.sam.py", line 129, in <module>
main()
File "/path/varbin.50k.sam.py", line 40, in main
**if chrominfo.has_key(thisChrom):
AttributeError: 'tuple' object has no attribute 'has_key'**
I don't work regularly in Python, can someone offer a suggestion?
Where do I begin?
Your code is expecting a dictionary and getting a tuple. I think you've missed a step: You need to change
chrominfo = ("/path/hg19.chrom.sizes.txt", 0)
To
chrominfo = fileToDictionary("/path/hg19.chrom.sizes.txt", 0)
Note also that if dict.has_key(key) has been deprecated in favour of if key in dict.keys()
i have a directory with 5+ invalid CSV files. i have no problems reading the files and then writing them as "good" CSV files one at a time. But when i try to process a second file i get "IndexError: array index out of range"
import xlrd
import csv, sys, os
import datetime, time
import logging
import Gmail_email
program = "CleanCSV"
date = datetime.datetime(1899, 12, 30)
argv0=""
argv1 = 'c:/tmp/checkEmail/' #input directory
argv2 = "f:/foo/in/bar-" #output directory
sys.argv = [argv0, argv1, argv2]
inDir = sys.argv[1]#input directory
outDir = sys.argv[2] #output directory
lList = [] #holder list to hold names of files to be processed
def processFiles():
try: #Makes list of local files in lDir, Populates lList
if os.listdir(inDir) == []: #checks for files in lDir
logging.info('No Files to upload')
exit()
else:
for file_name in os.listdir(inDir):
#print file_name
if os.path.isfile(inDir+file_name):
lList.append(file_name) # populate local dir list
if 'Thumbs.db' in lList: #remove windows thumbs file
lList.remove('Thumbs.db')
logging.info('Files to be checked')
logging.info('%s', lList )
#print lList, 'lList'
except Exception, e:
Gmail_email.email(e, program)
logging.warning('Error with local files')
logging.warning('%s', e)
exit()
for each in lList: #calls on cleanup method for each file in lLIst
filePath= inDir+each
print filePath, "filepath"
testFile(filePath)
def testFile(filePath):
try:
with open(filePath, "rb") as csvfile:
spamreader= csv.reader(csvfile, delimiter=' ', quotechar='|')
for row in spamreader:
#print "good file, most likely"
pass
except Exception, e:
logging.warning('Error with local files')
logging.warning('%s', e)
#print "cleaing bad file", filePath
cleanBadFile(filePath)
def cleanBadFile(filePath):
timestr = time.strftime("%Y%m%d-%H%M%S")
#print "bad file trying to clean"
f = open(outDir+timestr+".csv", 'ab')
try: #can i read the file
workbook = xlrd.open_workbook(filePath)
#will error here if bad xlrd cannot open it
print workbook.sheet_names()
#print workbook
except Exception, e:
#print e, " error"
pass
worksheet = workbook.sheet_by_name('Sheet')
num_rows = worksheet.nrows - 1
num_cells = worksheet.ncols - 1
#print worksheet.ncols, 'num cells'
curr_row = -1
while curr_row < num_rows: #goes over every row
num_cells = worksheet.ncols - 1
curr_row += 1
row = worksheet.row(curr_row)
print row, "row"
curr_cell = -1
print worksheet.row_len(curr_row), "row len"
print curr_row, curr_cell, "curr row, curr cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
print ' ', cell_type, ':', cell_value
values= []
if cell_type == 0: #tests if first value in row is data
#assuming that good rows will have a value in the first cell of each row
#if no data row is not copied to new file
print "bad line"
pass
else:
while curr_cell < num_cells:
curr_cell += 1
# Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
print curr_row, "; ",curr_cell, " row and cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
#print cell_type, ":", cell_value
if cell_type == xlrd.XL_CELL_DATE:
cell_value=datetime.timedelta(int(cell_value))
cell_value = str(date + cell_value)[:10]
#print cell_value, "cell value, cell date"
values.append(cell_value)
#print values, "values"
csv.writer(f, delimiter=',',
quotechar=',', quoting=csv.QUOTE_MINIMAL).writerow( values )
f.close()
print f.closed
print "ah"
curr_cell= 0
curr_row = 0
#print "checking file:", readFile
processFiles()
#print "exit"
exit
The error messsage
Traceback (most recent call last):
File "F:\cleanCSV.py", line 132, in <module>
processFiles()
File "F:\cleanCSV.py", line 51, in processFiles
testFile(filePath)
File "F:\cleanCSV.py", line 64, in testFile
cleanBadFile(filePath)
File "F:\cleanCSV.py", line 106, in cleanBadFile
cell_type = worksheet.cell_type(curr_row, curr_cell)
File "C:\Python27\lib\site-packages\xlrd\sheet.py", line 413, in cell_type
return self._cell_types[rowx][colx]
IndexError: array index out of range
I feel like I need to "reset" a counting variable to but think i have them all. I don't know what to do.
Two lines before the line causing the exception curr_cell is set to -1 which can't be a valid cell index. A comment some lines further down suggests you expect that to be the first cell in the row, so the index should be 0 instead of -1.
I moved my +1 (curr_cell+=1) down 3 lines.
while curr_cell < num_cells:
# Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
#print curr_row, "; ",curr_cell, " row and cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
print cell_type, ":", cell_value
curr_cell += 1
if cell_type == xlrd.XL_CELL_DATE:
cell_value=datetime.timedelta(int(cell_value))
cell_value = str(date + cell_value)[:10]
#print cell_value, "cell value, cell date"