I have not used Python in years and am trying to get back into it. I have an input file (.csv) that I want to parse, storing the output in an output .csv or .txt file.
I have managed to parse the .csv file using the code below, and for the most part it works, but I can't get it to save to a file (issue 1) without getting the error noted in the comments at the end of the code (error 1).
import csv
import re
import itertools

file_name = 'PhoneCallData1.txt'
try:
    lol = list(csv.reader(open(file_name, 'r'), delimiter=' '))
    count = 0
except:
    print('File cannot be opened:', file_name)
    exit()
try:
    fout = open('output.txt', 'w')
except:
    print("File cannot be written to:", "OutputFile")
    exit()
d = dict()
for item in itertools.chain(lol):  # lists all items (fields) in the CSV file
    count += 1  # counter to keep track of the row I'm looping through
    if lol[count][3] is None:
        print("value is not blank")
        count += 1
    else:
        try:
            check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3])  # check whether the value is a date
        except:
            continue
        check_cost = re.compile(r'($+\d*)', lol[count][9])  # check whether the value is a cost
        if check_date == True:
            try:
                key = lol[count][3]  # if it is a date value, store it as the key
            except ValueError:
                continue
        if check_cost == True:
            value = lol[count][9]  # if it is a cost ($), store it as the value
        d[key] = value
        print(d[key])
        # fout.write(d[key])
# What if there is no value in the cell?
# I keep getting "IndexError: list index out of range"; does anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charge
And now comes the complicated part: the file I need to parse has a number of irrelevant rows of data before and in between the required data.
Data format
What I want to do:
I want to iterate over two columns of data and only store the rows that have a date or a cost in them, discarding the rest of the data.
import csv
import re
import itertools

lol = list(csv.reader(open('PhoneCallData1.txt', 'r'), delimiter=' '))
count = 0
d = dict()
for item in itertools.chain(lol):  # lists all items (fields) in the CSV file
    count += 1  # counter to keep track of the row I'm looping through
    check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3])  # check whether the value is a date
    check_cost = re.compile(r'($+\d*)', lol[count][9])  # check whether the value is a cost
    if check_date == True:
        key = lol[count][3]  # if it is a date value, store it as the key
    if check_cost == True:
        value = lol[count][9]  # if it is a cost ($), store it as the value
    d[key] = value
    print(d[key])
# What if there is no value in the cell?
# I keep getting "IndexError: list index out of range"; does anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charges
What I have tried:
I tried to index the data after I loaded it, but that didn't seem to work.
I created the following to only look at rows that were longer than a certain length, but it's terrible code. I was hoping for something more practical and reusable.
import re

with open('PhoneCallData1.txt', 'r') as f, open('sample_output.txt', 'w') as fnew:
    for line in f:
        if len(line) > 50:
            print(line)
            fnew.write(line + '\n')

import csv

lol = list(csv.reader(open('PhoneCallData1.txt', 'rb'), delimiter='\t'))
# d = dict()
# key = lol[5][0]    # cell A7
# value = lol[5][3]  # cell D7
# d[key] = value     # add the entry to the dictionary
I keep getting index-out-of-bounds errors with this.
import re
import csv

match = re.search(r'(\d+/\d+/\d+)', 'testing date 11/12/2017')
print(match.group(1))

Here I was trying to use a regex to search for the date in the first column of data.
NOTE: I wanted to try pandas, but I feel I need to start here. Any help would be awesome.
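For reference, a minimal sketch of the filtering the question asks for, assuming (as in the code above) that the date sits in column 3 and the cost in the last field of a row; the column positions and file names are assumptions, not confirmed by the sample data:

import csv
import re

date_re = re.compile(r'\d+/\d+/\d+')      # e.g. 11/12/2017
cost_re = re.compile(r'\$\d+(?:\.\d+)?')  # e.g. $1.50

charges = {}
with open('PhoneCallData1.txt', 'r') as src, open('output.txt', 'w') as fout:
    for row in csv.reader(src, delimiter=' '):
        if len(row) < 4:  # skip short/irrelevant rows instead of indexing into them
            continue
        date_match = date_re.search(row[3])
        cost_match = cost_re.search(row[-1])  # assume the cost sits in the last field
        if date_match and cost_match:
            charges[date_match.group()] = cost_match.group()
            fout.write(date_match.group() + ',' + cost_match.group() + '\n')

Checking len(row) before indexing is what avoids the "IndexError: list index out of range" mentioned in the comments.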
The decision about whether the next record needs to be parsed has to be made explicitly; I have answered a similar question in the same way. A finite-state machine may help here.
The main code is:
state = 'init'
output = []
# for line loop:
    if state == 'init':  # seek the start of the data to parse
        # check whether parsing should start
        state = 'start'
    elif state == 'start':  # parsing now
        # parse the line
        # check whether parsing should end
        state = 'init'
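Fleshed out for this problem, a hedged sketch might look like the following; the assumption (not confirmed by the sample data) is that a row containing a date starts a call record and an empty row ends it:

import csv
import re

date_re = re.compile(r'\d+/\d+/\d+')

state = 'init'
output = []
with open('PhoneCallData1.txt', 'r') as f:
    for line in csv.reader(f, delimiter=' '):
        if state == 'init':
            # seek a row that starts a call record (it contains a date)
            if any(date_re.search(field) for field in line):
                output.append(line)
                state = 'start'
        elif state == 'start':
            # keep collecting rows until an empty row ends the record
            if not line:
                state = 'init'
            else:
                output.append(line)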
import csv
import re
import itertools
import timeit

start_time = timeit.default_timer()
# code you want to evaluate
file_name = 'PhoneCallData.txt'
try:
    lol = list(csv.reader(open(file_name, 'r'), delimiter=' '))
except:
    print('File cannot be opened:', file_name)
    exit()
try:
    fout = open('output.txt', 'w')
except:
    print("File cannot be written to:", "OutputFile")
    exit()
# I could assign key/value pairs and store them in a dictionary, then print, search, etc. on the dictionary. Version 2:
# d = dict()
count = 0
total = 0
for row in lol:  # lists all items (fields) in the CSV file
    # print(len(row))
    count += 1  # counter to keep track of the row I'm looping through
    if len(row) == 8:
        if row[2].isdigit():
            # Remove the $ and convert to float
            cost = re.sub('[$]', '', row[7])
            # Assign total value
            try:
                # Calculate total for verification purposes
                total = total + float(cost)
                total = round(total, 2)
            except:
                continue
            string = str(row[2] + " : " + row[7] + " : " + str(total) + "\n")
            print(string)
            fout.write(string)
    if len(row) == 9:
        if row[2].isdigit():
            # Remove the $ and convert to float
            cost = re.sub('[$]', '', row[8])
            # Assign total value
            try:
                # Calculate total for verification purposes
                total = total + float(cost)
                total = round(total, 2)
            except:
                continue
            string = str(row[2] + " : " + row[8] + " : " + str(total) + "\n")
            print(string)
            fout.write(string)
    if len(row) == 10:
        # print(row[2] + ":" + row[9])
        # Remove the $ and convert to float
        cost = re.sub('[$]', '', row[9])
        # Assign total value
        try:
            # Calculate total for verification purposes
            total = total + float(cost)
            total = round(total, 2)
        except:
            continue
        string = str(row[2] + " : " + row[9] + " : " + str(total) + "\n")
        print(string)
        fout.write(string)
# Convert to strings so I can print and store them in the file
count_string = str(count)
total_string = str(total)
total_string.split('.', 2)
# Write to screen
print(total_string + " Total\n")
print("Rows parsed: " + count_string)
# Write to file
fout.write(count_string + " Rows were parsed\n")
fout.write(total_string + " Total")
# Calculate time spent on the task
elapsed = timeit.default_timer() - start_time
round_elapsed = round(elapsed, 2)
string_elapsed = str(round_elapsed)
fout.write(string_elapsed)
print(string_elapsed + " seconds")
fout.close()
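As a side note, the three near-identical branches above differ only in which column holds the cost; since it is always the last field, a hedged, shorter variant (same files and variables assumed) could be:

total = 0
for row in lol:
    # rows of interest have 8-10 fields, a numeric code in column 2,
    # and the cost in the last field
    if 8 <= len(row) <= 10 and row[2].isdigit():
        try:
            total = round(total + float(row[-1].lstrip('$')), 2)
        except ValueError:
            continue
        fout.write(row[2] + " : " + row[-1] + " : " + str(total) + "\n")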
I have tried a lot by now, and have also read a lot over the past days, but I can't come up with a correct solution to my problem. Maybe someone can give me a helping hand.
I have two CSV files, where wdw_clip_db_2018-01-17_2.csv (4720 lines) should hold all the data and wdw_content_complete.csv (2752 lines) only a subset.
wdw_clip_db_2018-01-17_2.csv:
11,0_7cjgob0v,9000301_AzubiGlueckT1.mxf,0_7cjgob0v.mpg
43,0_heor15yl,,0_heor15yl.mpg
1616,0_dfopff5t,578_Bier.MXF,0_dfopff5t.mpg
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
1931,0_cbx3zgw6,9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6.mpg
wdw_content_complete.csv:
1737,9000301_AzubiGlueckT1.mxf,0_7cjgob0v
1451,578_Bier.MXF,0_dfopff5t
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
1762,9070201_KeinGeldFuerGeschen.mxf,NULL
What I need to come up with are the following CSV files, readable by Excel:
wdw_clean_assets.csv:
9000301_AzubiGlueckT1.mxf,0_7cjgob0v
578_Bier.MXF,0_dfopff5t
Where wdw_clean_assets.csv holds every line whose file name and external_reference (e.g. 0_7cjgob0v) both match.
wdw_to_add_ext_refs.csv:
9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6
Where wdw_to_add_ext_refs.csv holds every line whose file name matches but whose external_reference field is NULL; the NULL is replaced with the external reference found in wdw_clip_db_2018-01-17_2.csv.
When I compare the numbers of lines, there seem to be lines in wdw_content_complete.csv which are not in wdw_clip_db_2018-01-17_2.csv. To be honest, this shouldn't be the case, so I need to find out what's wrong with these lines. Therefore I need to put the rest of wdw_content_complete.csv into a new CSV file.
wdw_to_clean_assets.csv:
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
And finally I need the rest of both CSVs, wdw_clip_db_2018-01-17_2.csv and wdw_content_complete.csv, in two separate CSVs. For that I tried to somehow subtract one list from the other, which unfortunately also doesn't work correctly.
wdw_hansi_assets_rest.csv:
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
wdw_mediahub_assets_rest.csv:
1433,445_Holzverarbeitung.MXF,NULL
What I have got so far is this Python script:
import csv

# CSV files
# wdw_clip_db_2018-01-17_2.csv
# wdw_content_complete.csv

# Reading the CSV files
hansi_assets = []
with open('wdw_clip_db_2018-01-17_2.csv') as hansi_db:
    reader = csv.reader(hansi_db)
    for row in reader:
        hansi_assets.append(row)
hansi_db.close()

mediahub_assets = []
with open('wdw_content_complete.csv') as mediahub_db:
    reader = csv.reader(mediahub_db)
    for row in reader:
        mediahub_assets.append(row)
mediahub_db.close()

clean_asset = []
clean_assets = []
to_add_ext_ref = []
to_add_ext_refs = []
to_clean_assets = []
hansi_assets_rest = []
mediahub_assets_rest = []
hansi_assets_rm = []
mediahub_assets_rm = []

num_clean_rows = 0
num_to_clean_rows = 0
num_to_add_ext_refs = 0
num_dirty_rows = 0
num_hansi_iterations = 0
num_mediahub_iterations = 0
num_mediahub_null = 0
num_hansi_mediahub_matches = 0

# Looping over the CSV files
for hansi_asset in hansi_assets:
    num_hansi_iterations += 1
    for mediahub_asset in mediahub_assets:
        num_mediahub_iterations += 1
        # Checking if there are similar, clean entries
        if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and hansi_asset[1] == mediahub_asset[2]:
            clean_assets.append(mediahub_asset)
            # Counting for evaluation reasons
            num_clean_rows += 1
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)
        # Checking if there are entries which miss the Ext_Ref field, and replacing the NULL with the corresponding Ext_Ref from the hansi_asset
        elif hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and mediahub_asset[2] == "NULL":
            to_add_ext_ref = [mediahub_asset[1], hansi_asset[1]]
            to_add_ext_refs.append(to_add_ext_ref)
            # Counting for evaluation reasons
            num_to_add_ext_refs += 1
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)
        # Checking if there are entries that don't match
        elif hansi_asset[2] != mediahub_asset[1] or hansi_asset[3] != mediahub_asset[1]:
            to_clean_assets.append([mediahub_asset[1], mediahub_asset[2]])
            # Counting for evaluation reasons
            num_to_clean_rows += 1
            # Creating a list to subtract from its origin to get the rest
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)

# Just counting the matches
for hansi_asset in hansi_assets:
    for mediahub_asset in mediahub_assets:
        if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1]:
            num_hansi_mediahub_matches += 1

# Just counting the NULLs
for mediahub_asset in mediahub_assets:
    num_mediahub_iterations += 1
    if mediahub_asset[2] == "NULL":
        num_mediahub_null += 1

# for mediahub_asset_rm in mediahub_assets_rm:
#     if mediahub_asset[1] != mediahub_asset_rm[1]:
#         mediahub_assets_rest = Diff(mediahub_assets, mediahub_assets_rm)

# Trying to subtract mediahub_assets_rm from mediahub_assets to get the rest
mediahub_assets_rest = [item for item in mediahub_assets_rm if item not in mediahub_assets]
hansi_assets_rest = [item for item in hansi_assets_rm if item not in hansi_assets]

# Printing some lines for evaluation
print hansi_assets[1]
print mediahub_assets[1]
print clean_assets[1]
print to_clean_assets[1]
print to_add_ext_refs[1]
print hansi_assets_rest[1]
print mediahub_assets_rest[1]
print hansi_assets_rm[1]
print mediahub_assets_rm[1]

print "Num Hansi Assets: " + str(len(hansi_assets))
print "Num Mediahub Assets: " + str(len(mediahub_assets))
print "Num Clean Assets: " + str(len(clean_assets))
print "Num Hansi Assets to remove: " + str(len(hansi_assets_rm))
print "Num Mediahub Assets to remove: " + str(len(mediahub_assets_rm))
print "Num Hansi Rest Assets: " + str(len(hansi_assets_rest))
print "Num Mediahub Rest Assets: " + str(len(mediahub_assets_rest))
print "Num Mediahub NULLs: " + str(num_mediahub_null)
print "Num Hansi Mediahub Matches: " + str(num_hansi_mediahub_matches)
print "Num Clean Rows: " + str(num_clean_rows)
print "Num To Clean Rows: " + str(num_to_clean_rows)
print "Num To Add Ext_Ref: " + str(num_to_add_ext_refs)
print "Num Dirty Rows: " + str(num_dirty_rows)
print "Num Hansi Iterations: " + str(num_hansi_iterations)
print "Num Mediahub Iterations: " + str(num_mediahub_iterations / num_hansi_iterations)

# Writing clean_assets to a file
with open('wdw_clean_assets.csv', 'w') as wdw_clean_assets:
    writer = csv.writer(wdw_clean_assets)
    for row in clean_assets:
        writer.writerow([row])
wdw_clean_assets.close()

with open('wdw_to_add_ext_refs.csv', 'w') as wdw_to_add_ext_refs:
    writer = csv.writer(wdw_to_add_ext_refs)
    for row in to_add_ext_refs:
        writer.writerow([row])

with open('wdw_to_clean_assets.csv', 'w') as wdw_to_clean_assets:
    writer = csv.writer(wdw_to_clean_assets)
    for row in to_clean_assets:
        writer.writerow([row])
wdw_to_clean_assets.close()

with open('wdw_hansi_assets_rest.csv', 'w') as wdw_hansi_assets_rest:
    writer = csv.writer(wdw_hansi_assets_rest)
    for row in hansi_assets_rest:
        writer.writerow([row])
wdw_hansi_assets_rest.close()

with open('wdw_mediahub_assets_rest.csv', 'w') as wdw_mediahub_assets_rest:
    writer = csv.writer(wdw_mediahub_assets_rest)
    for row in mediahub_assets_rest:
        writer.writerow([row])
wdw_mediahub_assets_rest.close()
Any help appreciated!
Manuel
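One observation on the script above, offered as a hedged sketch rather than a confirmed fix: the two "rest" comprehensions appear to be inverted. They keep items of the *_rm lists that are not in the full lists, but every *_rm item came from the full lists, so the result is always empty. Subtraction the other way around, with tuples as set members because csv rows are lists (unhashable), could look like this:

# 'rest' = rows of the full list that were never matched/removed
removed = set(tuple(row) for row in mediahub_assets_rm)
mediahub_assets_rest = [row for row in mediahub_assets if tuple(row) not in removed]

removed = set(tuple(row) for row in hansi_assets_rm)
hansi_assets_rest = [row for row in hansi_assets if tuple(row) not in removed]

A dictionary keyed on the file name column would similarly replace the quadratic nested loop with single passes, though that is a separate change.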
I have a stock file in this format:
12345678,Fridge,1,50
23456789,Car,2,50
34567890,TV,20,50
This is the code:
def main():
    products = {}
    # This is my file being opened.
    f = open('stockfile.txt')
    for line in f:
        # Need to strip to eliminate the end-of-line character '\n'
        line = line[:-1]
        # The row is split by the comma
        row = line.split(',')
        products[row[0]] = [row[1], row[2], row[3]]
        # The products are equal to rows 1, 2 and 3, so GTIN 12345678 corresponds to Fridge and its price and stock figures.
    print(products)
    total = 0
    print('Id Description Total')
    while True:
        GTIN = input('Please input GTIN ')
        if GTIN not in products:
            print('Sorry your code was invalid, try again:')
            break
        row = products[GTIN]
        print(GTIN)
        description = row[0]
        value = row[1]
        stock = row[2]
        print(stock)
        quantity = input('Please also input your quantity required: ')
        row[2] = int(stock) - int(quantity)
        products[row[2]] = row[2]
        product_total = int(quantity) * int(value)
        New_Stock = GTIN + ',' + description + ',' + value + ',' + str(products[row[2]])
        f = open('stockfile.txt', 'r')
        lines = f.readlines()
        f.close()
        f = open("stockfile.txt", "a")
        for row in lines:
            if (row + '\n') != (New_Stock + '\n'):
                f.write(New_Stock)
        f.close()
        print('%20s%20s%20s' % (GTIN, description, product_total))
        total = total + product_total
    print('Total of the order is £%s' % total)
    print(products)

main()
However, the code doesn't update the stock. What it should do is remove the previous stock figure for the given product and then update it according to the quantity the user has just bought.
I haven't got to it yet, but once the stock hits zero I need my code to tell the user that we have run out of stock and need new stock, ask the user to wait until we restock, and then display the price of restocking as well.
If you have time, could you write this new bit of code as well? If not, could you just explain how to update the stock and why my code isn't working? Thank you.
When you seek to a given line and call write, then in order to completely overwrite that line, without affecting other lines or inadvertently creating new lines, each line in your stock file must have a fixed width. How do you ensure a fixed width? By giving each field in the record a fixed width. Basically, you choose a maximum number of characters each field can have; here I'll assume 8 for all fields (though they need not all be the same), so your stock will be stored this way:
12344848, Fridge, 2, 50
13738389, TV, 5, 70
If you keep going this way, each line will have a fixed maximum width, which will enable you to seek to the start of the line and overwrite it completely. Try this code:
MAX_FIELD_LEN = 8

def main():
    products = {}
    product_location = {}
    location = 0
    # This is my file being opened.
    with open('stockfile.txt', 'r+') as f:
        for line in f:
            # keep track of each product's location in the file, to overwrite it with New_Stock
            product_location[line.split(',')[0]] = location
            location += len(line)
            # Strip the end-of-line character '\n'
            line = line[:-1]
            # The row is split by the comma
            row = line.split(',')
            products[row[0]] = [row[1], row[2], row[3]]
        print(products)
        total = 0
        while True:
            GTIN = input('Please input GTIN: ')
            # To terminate user input, they just need to press ENTER
            if GTIN == "":
                break
            if GTIN not in products:
                print('Sorry your code was invalid, try again:')
                break
            row = products[GTIN]
            description, value, stock = row
            print('Stock data: ')
            print('GTIN \t\tDesc. \t\tStock \t\tValue')
            print(GTIN, '\t', description, '\t', stock, '\t', value)
            quantity = input('Please also input your quantity required: ')
            row[2] = str(int(stock) - int(quantity))
            product_total = int(quantity) * int(value)
            for i in range(len(row)):
                row[i] = row[i].rjust(MAX_FIELD_LEN)
            New_Stock = GTIN.rjust(MAX_FIELD_LEN) + ',' + ','.join(row) + '\n'
            # print(New_Stock, len(New_Stock))
            f.seek(product_location[GTIN])
            f.write(New_Stock)
            print('You bought: {0} {1} \nCost: {2}'.format(GTIN, description, product_total))
            total = total + product_total
    f.close()
    print('Total of the order is £%s' % total)

main()
Ensure that each field in the TXT file is exactly 8 characters wide (not counting the commas) when using this program. If you want to increase the field width, change the MAX_FIELD_LEN variable accordingly. With every field right-justified to 8 characters, your TXT file should look like this:
12345678,  Fridge,       1,      50
23456789,     Car,       2,      50
34567890,      TV,      20,      50
In the first few lines you are loading the whole data file into memory:
for line in f:
    products[row[0]] = [row[1], row[2], row[3]]
So then just update the data in memory, and have users enter a special command, "save", to write the whole list back to your file.
You could also catch your application process's KILL signal, so that if a user hits Ctrl+C you can ask whether he wants to save before quitting.
And maybe save a temporary copy of the list to a file every few seconds.
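A minimal sketch of that Ctrl+C handling, catching KeyboardInterrupt rather than installing a low-level signal handler; save_products() is a hypothetical helper, and the sketch assumes the products dict lives at module level rather than inside main():

def save_products():
    # hypothetical helper: write the in-memory products dict back to the file
    with open('stockfile.txt', 'w') as f:
        for gtin, fields in products.items():
            f.write(gtin + ',' + ','.join(str(x) for x in fields) + '\n')

try:
    main()
except KeyboardInterrupt:
    # Ctrl+C raises KeyboardInterrupt in the main thread
    if input('\nSave before quitting? (y/n) ').lower() == 'y':
        save_products()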
I suggest you use the shelve module for this if your customers intend to run this program many times. Reading the whole file into memory and writing it anew as text will become inefficient as your stock grows. Shelve creates persistent files on your PC for storing your data (three files, to be exact). Most importantly, shelve gives you the same dict interface you want, so all you have to do is call shelve.open() on the file and you can begin accessing and updating your stock using GTINs as keys. It's very straightforward; just look at the Python manuals. If you really want a text file, your program could iterate through the shelve file containing your stock (the same as for a dictionary) and write the keys (GTINs) and their values (your stock quantities) to a text file you opened. This way, you have easy, intuitive access to your records and also a readable format in your TXT file.
This was the text file I used with the code above:
12345678, Fridge, 1, 50
23456789, Car, 2, 50
34567890, TV, 20, 50
Does it make a difference that I am doing this on a Mac desktop, or that it is Python 3.4.3?
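For reference, a minimal sketch of the shelve suggestion made earlier; the file name stock_shelf and the [description, value, stock] layout mirror the question's dict but are assumptions:

import shelve

# GTINs are the keys; [description, value, stock] lists are the values
with shelve.open('stock_shelf', writeback=True) as db:
    if '12345678' not in db:
        db['12345678'] = ['Fridge', '1', '50']
    # sell 5 units: mutate the nested list; writeback=True persists it on close
    db['12345678'][2] = str(int(db['12345678'][2]) - 5)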
The suggestion above to use shelve sounds like a good idea, but if you want to keep your file as it is and only update the records that change (instead of rewriting the whole file each time), reusing most of your code, this seems to work:
def main():
    products = {}
    product_location = {}
    location = 0
    # This is my file being opened.
    with open('stockfile.txt', 'r+') as f:
        for line in f:
            # keep track of each product's location in the file, to overwrite it with New_Stock
            product_location[line.split(',')[0]] = location
            location += len(line)
            # Strip the end-of-line character '\n'
            line = line[:-1]
            # The row is split by the comma
            row = line.split(',')
            products[row[0]] = [row[1], row[2], row[3]]
        print(sorted(products.items()))
        total = 0
        while True:
            GTIN = input('\nPlease input GTIN or press [Enter] to quit:\n')
            # To terminate user input, they just need to press ENTER
            if GTIN == "":
                break
            if GTIN not in products:
                # Let the user continue with the order after a mistake in the GTIN input
                print('Sorry your code was invalid, try again:')
                continue
            row = products[GTIN]
            print('GTIN:', GTIN)
            description = row[0]
            value = row[1]
            stock = row[2]
            stock_length = len(row[2])
            backorder = 0
            print('In Stock:', stock)
            quantity = input('Please also input your quantity required:\n')
            if int(quantity) > int(stock):
                row[2] = 0
                backorder = int(quantity) - int(stock)
                # TO DO
                Backordered_Stock = GTIN + ',' + description + ',' + value + ',' + str(backorder) + '\n'
            else:
                row[2] = int(stock) - int(quantity)
            product_total = int(quantity) * int(value)
            New_Stock = GTIN + ',' + description + ',' + value + ',' + str(row[2]).rjust(stock_length) + '\n'
            f.seek(product_location[GTIN])
            f.write(New_Stock)
            print('Ordered - {0:>6} GTIN: {1:>10} Desc: {2:<20} at £{3:>6} Total value: £{4:>6} On backorder: {5:>4}'.
                  format(int(quantity), GTIN, description, int(value), product_total, backorder))
            total = total + product_total
    print('Total of the order is £%s' % total)

main()
I am brand new to programming, but this is a very enjoyable challenge.
Here's a question which I suspect is caused by a misunderstanding of Python loops.
System info: using Notepad++ and IDLE, Python 3.4.3, on Win 7 32-bit.
My solution is to open database 1 and use it to look for the correct master entry in database 2, pull an index number (task_no), then write a third file identical to the first database, this time with the correct index number.
My problem is that it performs the 1st and 2nd loops correctly, but then, on the 2nd iteration of loop 1, it tries to perform a block in loop 2 while iterating through the rows of loop 1, not the task_rows of loop 2.
Footnote: both files are quite large (several MB), so I'm not sure whether storing them in memory is a good idea.
This was a relevant question that I found closest to this problem:
python nested loop using loops and files
What I took from it was to try moving the file opening inside the 1st loop, but the problem persists. Something to do with how I'm using the CSV reader?
I also have the sinking suspicion that there may be a root cause in my problem solving, so I am open to suggestions for alternative ways to solve the problem.
Thanks in advance!
The gist:
for row in readerCurrentFile:  # LOOP 1
    # iterates through readerCurrentFile to define search variables
    [...]
    for task_row in readerTaskHeader:  # LOOP 2
        # searches each row iteratively through readerTaskHeader
        # Match compid
        # if no match, continue  <<<- This is where it goes back to the 1st loop
        [...]
        # Match task frequency
        # if no match, continue
        [...]
        # once both of the above matches check out, grab the data (task_no) from task_row[0]
        task_no = ""
        task_no = task_row[0]
        if task_row:
            break
    [...]
    # writes the PM schedule row
    print("Successful write of PM schedule row")
    print(compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ": " + pmid + " " + task_no)
The entire code:
import csv
import re

# Writes the schedule
csvNewPMSchedule = open('new_pm_schedule.csv', 'a', newline='')
writerNewPMSchedule = csv.writer(csvNewPMSchedule)

# Dictionaries of PM frequency
def re_compile_dict(d, f):
    for k in d:
        d[k] = re.compile(d[k], flags=f)

dict_month = {60: 'Quin', 36: 'Trien', 24: 'Bi-An', 12: 'Annual(?<!Bi-)(?<!Semi-)', 6: 'Semi-An', 3: 'Quart', 2: 'Bi-Month', 1: 'Month(?<!Bi-)'}
dict_week = {2: 'Bi-Week', 1: 'Week(?<!Bi-)'}
dict_freq_names = {'60Months': 'Quintennial', '36Months': 'Triennial', '24Months': 'Bi-Annual', '12Months': 'Annual', '6Months': 'Semi-Annual', '3Months': 'Quarterly', '2Months': 'Bi-Monthly', '1Months': 'Monthly', '2Weeks': 'Bi-Weekly', '1Weeks': 'Weekly'}

re_compile_dict(dict_month, re.IGNORECASE)
re_compile_dict(dict_week, re.IGNORECASE)

# Unique task counter
task_num = 0
total_lines = 0

# Error catcher
error_in_row = []

# Blank out all row variables
pmid = 0
compid = 0
comp_desc = 0
pmfreqx = 0
pmfreq = 0
pmfreqtype = 0

# PM schedule draft (as provided by eMaint)
currentFile = open('pm_schedule.csv', encoding='windows-1252')
readerCurrentFile = csv.reader(currentFile)

# Loop 1
for row in readerCurrentFile:
    if row[0] == "pmid":
        continue
    # defines row items
    pmid = row[0]
    compid = row[1]
    comp_desc = row[2]
    # quantity of PM frequency
    pmfreqx_temp = row[3]
    # unit of PM frequency; choices are: Months, Weeks
    pmfreq = row[4]
    # pmfreqtype is currently only static; not sure what other options we have
    pmfreqtype = row[5]
    # pmnextdate is the next scheduled due date after this one; we probably need logic later that closes out any past-due date
    pmnextdate = row[6]
    # Task number: this is what we want to change
    # We want to change this to the task header's task_desc
    sched_task_desc = row[8]
    # last done date
    last_pm_date = row[9]
    #
    # determines frequency search criteria
    #
    try:
        pmfreqx = int(pmfreqx_temp)
    except (TypeError, ValueError):
        print("Invalid PM frequency data, skipping row " + pmid)
        error_in_row.append(pmid)
        continue
    #
    # defines frequency search variable
    #
    freq_search_var = ""
    if pmfreq == "Weeks":
        freq_search_var = dict_week[pmfreqx]
    elif pmfreq == "Months":
        freq_search_var = dict_month[pmfreqx]
    if not freq_search_var:
        print("Error in assigning frequency " + compid + " " + str(pmfreqx) + " " + pmfreq)
        error_in_row.append(pmid)
        continue
    # defines equipment ID search variable
    print(compid + " frequency found: " + str(pmfreqx) + " " + str(pmfreq))
    compid_search_var = re.compile(compid, re.IGNORECASE)
    #
    # Matching function - search taskHeader for data
    #
    # PM task header reference
    taskHeader = open('taskheader.csv', encoding='windows-1252')
    readerTaskHeader = csv.reader(taskHeader)
    for task_row in readerTaskHeader:
        # task_row[0]: taskHeader PM number
        # task_row[1]: taskHeader task_desc
        # task_row[2]: taskHeader task_notes
        #
        # search for compid
        compid_match = ""
        compid_match = compid_search_var.search(task_row[1])
        if not compid_match:
            print(task_row[1] + " does not match ID for " + compid + ", trying next row.")  # debug 2
            continue  # <<< STOPS ITERATING RIGHT OVER HERE
        print("Found compid " + task_row[1])  # debug line
        #
        freq_match = ""
        freq_match = freq_search_var.search(task_row[1])
        if not freq_match:
            print(task_row[1] + " does not match freq for " + compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ", trying next row.")  # debug line
            continue
        print("Frequency match: " + compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)])  # freq debug line
        #
        task_no = ""
        print("Assigning task number to " + task_row[0])
        task_no = task_row[0]
        if task_row:
            break
    #
    # error check
    #
    if not task_no:
        print("ERROR IN SEARCH " + compid + " " + pmid)
        error_in_row.append(pmid)
        continue
    #
    # Writes rows
    #
    writerNewPMSchedule.writerow([pmid, compid, comp_desc, pmfreqx, pmfreq, pmfreqtype, pmnextdate, task_no, sched_task_desc, last_pm_date])
    print("Successful write of PM schedule row")
    print(compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ": " + pmid + " " + task_no)
    print("==============")

# Error reporting lined out for now
# for row in error_in_row:
#     writerNewPMSchedule.writerow(["Error in row:", str(error_in_row[row])])
#     print("Error in row: " + str(error_in_row[row]))

print("Finished")
I'm writing a Python script that works with two CSV files. Let's call them csv1.csv (the original file to read) and csv2.csv (an exact copy of csv1). The goal is to find the row and column in the CSV file that correspond to the modified, user-defined input.
CSV format (continues for about 2-3 thousand lines):
record LNLIM, ID_CO,OD_DV,ID_LN, ST_LN, ZST_LN, ID_LNLIM,LIMIT1_LNLIM, LIMIT2_LNLIM, LIMIT3_LNLIM
LNLIM, 'FPL', 'SOUT', '137TH_LEVEE_B', 'B', '137TH_AV', 'LEVEE', 'A', 1000, 1100, 1200
LNLIM, 'FPL', 'SOUT', '137TH_DAVIS_B', 'A', '137TH_AV', 'NEWTON', 'A', 1000, 1100, 1200
...
Let's say that the user is looking for 137TH_AV and NEWTON. I want to be able to go row by row and compare the two columns ST_LN and ZST_LN. If both columns match what the user entered, then I want to capture which row in the CSV file that happened on, and use that information to edit the remaining columns LIMIT1_LNLIM, LIMIT2_LNLIM and LIMIT3_LNLIM on that row with new analog values.
I want to get the 3 new values provided by the user and edit a specific row and a specific row element. Once I've found the place to replace the number values, I want to overwrite csv2.csv with this edit.
Determining where the line segment is located in the array
import sys
import csv
import os
import shutil

LineSectionNames = []
ScadaNames = []
with open('Vulcan_Imp_Summary.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        LineSectionName = row[1]
        ScadaName = row[29]
        LineSectionNames.append(LineSectionName)
        ScadaNames.append(ScadaName)

# Reformatting arrays for accurate references
LineSectionNames = [character.replace('\xa0', ' ') for character in LineSectionNames]
LineSectionNames = [character.replace('?', '-') for character in LineSectionNames]
ScadaNames = [character.replace('\xa0', ' ') for character in ScadaNames]

# Setting Line Section name as key and Scada name as value
ScadaDict = {}
for i in range(len(LineSectionNames)):
    ScadaDict[LineSectionNames[i]] = ScadaNames[i]

# Prompt user for grammatical name of Line Section
print('Enter the Line Section Name: (Example = Goulds-Princeton) \n')
user_input = input()

# Reference user input to dictionary value to convert input into SCADA format
def reformat():
    print('Searching for Line Section...' + user_input)
    if user_input in ScadaDict:
        value = ScadaDict[user_input]
        print('\n\t Match!\n')
    else:
        print('The Line Section name you have entered was incorrect. Try again. \n Example = Goulds-Princeton')

reformat()

# Copying the exported file from Genesys
path = 'I://PSCO//DBGROUP//PatrickL//'
shutil.copyfile(path + 'lnlim_import.csv', path + 'lnlim_import_c.csv')

# Using the SCADA format to search through the csv file
print('Searching csv file for...' + user_input)

# Reading the copied file
record_lnlims = []
id_cos = []
id_dvs = []
id_lines = []
id_lns = []
st_lns = []
zst_lns = []
id_lnlims = []
limit1_lnlims = []
limit2_lnlims = []
limit3_lnlims = []
with open('lnlim_import_c.csv', 'r') as copy:
    reader = csv.reader(copy)
    for row in reader:
        record_lnlim = row[0]
        id_co = row[1]
        id_dv = row[2]
        id_line = row[3]
        id_ln = row[4]
        st_ln = row[5]
        zst_ln = row[6]
        id_lnlim = row[7]
        limit1_lnlim = row[8]
        limit2_lnlim = row[9]
        limit3_lnlim = row[10]
        record_lnlims.append(record_lnlim)
        id_cos.append(id_co)
        id_dvs.append(id_dv)
        id_lines.append(id_line)
        id_lns.append(id_ln)
        st_lns.append(st_ln)
        zst_lns.append(zst_ln)
        id_lnlims.append(id_lnlim)
        limit1_lnlims.append(limit1_lnlim)
        limit2_lnlims.append(limit2_lnlim)
        limit3_lnlims.append(limit3_lnlim)

# Reformatting the user input from GOULDS-PRINCETON to 'GOULDS' and 'PRINCETON'
input_split = user_input.split('-', 1)
st_ln1 = input_split[0]
zst_ln1 = input_split[1]
st_ln2 = st_ln1.upper()
zst_ln2 = zst_ln1.upper()
st_ln3 = "'" + str(st_ln2) + "'"
zst_ln3 = "'" + str(zst_ln2) + "'"

# Receiving analog values from user
print('\n\t Found! \n')
print('Enter the Specified Emergency Rating (A) for 110% for 7 minutes: ')
limit1_input = input()
print('Enter the Specified Emergency Rating (A) for 120% for 7 minutes: ')
limit2_input = input()
print('Enter the Specified Emergency Rating (A) for 130% for 5 minutes: ')
limit3_input = input()
Whenever I print the row_index, it prints the initialized value of 0.
i = 0
row_index = 0
for i in range(len(st_lns)):
    if st_ln3 == st_lns[i] and zst_ln3 == zst_lns[i]:
        row_index = i
print(row_index)

limit1_input = limit1_lnlims[row_index]
limit2_input = limit2_lnlims[row_index]
limit3_input = limit3_lnlims[row_index]

csv_list = []
csv_list.append(record_lnlims)
csv_list.append(id_cos)
csv_list.append(id_dvs)
csv_list.append(id_lines)
csv_list.append(st_lns)
csv_list.append(zst_lns)
csv_list.append(id_lnlims)
csv_list.append(limit1_lnlims)
csv_list.append(limit2_lnlims)
csv_list.append(limit3_lnlims)

# Editing the csv file copy to implement new analog values
with open('lnlim_import_c.csv', 'w') as edit:
    for x in zip(csv_list):
        edit.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\n".format(x))
This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and sort the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this:
M 4
P 10
For each record in input.txt above, I want to look up the count in num.txt whose key matches the last column of the record's header line, and cut the record's characters according to that value.
I think the error in my code is that it only accepts text files containing integers, where it should also accept files which contain alphabetic characters.
The totally revised version, after a long chat with the OP:
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)

numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
# print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()

for hash_name, count in numbers.iteritems():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        # print(data_original)
        # print(stripped_data)
# print(file_i)

# Write the result to a new file, input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so:

import re

min_count = 4  # this variable will contain the count from which to start removing
str_to_match = 'EOG6CC67M'  # this variable will contain the filename you read
input = ''  # the file input (input.txt) will go in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing
    return e.group()  # otherwise keep the match as it is

result = re.sub(r'' + str_to_match, callback_f, input)

With this tactic you can keep count with a global counter, and there is no need for hard line-loops with complex structures.
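A quick, hypothetical demonstration of the callback, with values invented for illustration; with min_count = 2, every occurrence of 'X' after the second one is removed:

min_count = 2
str_to_match = 'X'
input = 'XaXbXcXd'
counter = 0
# callback_f prints a debug line per match; the re.sub result is 'XaXbcd'
print(re.sub(str_to_match, callback_f, input))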
Update
A more detailed version, with file access:
import os
import re

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.iteritems():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
The considered directory/file structure:
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you show in your original question.
The num files are the hash files, each containing an integer.
What happens in this script:
Collect all hash files (in a dictionary), each with the count it contains
Loop through all CSV files
For each CSV file, sub-loop through the collected numbers
Replace/remove hashes after a certain count (based on what you do in callback_f())
Write the output back; it's the last comment in the script and would contain the file.write() functionality, as in the sketch below
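As a closing illustration, a minimal sketch of that write-back step, assuming the same ./csv_files layout and the file/result variables from the update script above:

with open('./csv_files/' + file, 'w') as out:
    # overwrite the processed CSV with the substituted content from re.sub
    out.write(result)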