Python Nested Loops - continue iterates first loop

Brand new to programming, but it's a very enjoyable challenge.
Here's a question which I suspect may be caused by a misunderstanding of Python loops.
System info: using Notepad++ and IDLE with Python 3.4.3 on Win 7 32-bit.
My solution is to open one database, use it to look up the correct master entry in database 2, pull an index number (task_no), then write a third file identical to the first database, this time with the correct index number.
My problem is that it performs the 1st and 2nd loops correctly, but on the 2nd iteration of loop 1 it tries to perform a block from loop 2 while iterating through the rows of loop 1, not the task_rows of loop 2.
Footnote: both files are quite large (several MB), so I'm not sure if storing them in memory is a good idea.
This was a relevant question that I found closest to this problem:
python nested loop using loops and files
What I got out of it was to try moving the file opening inside the 1st loop, but the problem persists. Is it something to do with how I'm using the CSV reader?
I also have a sinking suspicion that there may be a root cause in my problem solving, so I'm open to suggestions for alternative ways to solve the problem.
Thanks in advance!
The gist:
for row in readerCurrentFile: # LOOP 1
    # iterates through readerCurrentFile to define search variables
    [...]
    for task_row in readerTaskHeader: # LOOP 2
        # searches each row iteratively through readerTaskHeader
        # Match compid
        # if no match, continue  <<<- This is where it goes back to the 1st loop
        [...]
        # Match task frequency
        # if no match, continue
        [...]
        # once both of the above matches check out, will grab data (task_no from task_row[0])
        task_no = ""
        task_no = task_row[0]
        if task_row:
            break
    [...]
    # writes PM code
    print("Successful write of PM schedule row")
    print(compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ": " + pmid + " " + task_no)
The entire code:
import csv
import re

# Writes schedule
csvNewPMSchedule = open('new_pm_schedule.csv', 'a', newline='')
writerNewPMSchedule = csv.writer(csvNewPMSchedule)

# Dictionaries of PM Frequency
def re_compile_dict(d, f):
    for k in d:
        d[k] = re.compile(d[k], flags=f)

dict_month = {60: 'Quin', 36: 'Trien', 24: 'Bi-An', 12: 'Annual(?<!Bi-)(?<!Semi-)', 6: 'Semi-An', 3: 'Quart', 2: 'Bi-Month', 1: 'Month(?<!Bi-)'}
dict_week = {2: 'Bi-Week', 1: 'Week(?<!Bi-)'}
dict_freq_names = {'60Months': 'Quintennial', '36Months': 'Triennial', '24Months': 'Bi-Annual', '12Months': 'Annual', '6Months': 'Semi-Annual', '3Months': 'Quarterly', '2Months': 'Bi-Monthly', '1Months': 'Monthly', '2Weeks': 'Bi-Weekly', '1Weeks': 'Weekly'}

re_compile_dict(dict_month, re.IGNORECASE)
re_compile_dict(dict_week, re.IGNORECASE)

# Unique Task Counter
task_num = 0
total_lines = 0

# Error catcher
error_in_row = []

# Blank out all rows
pmid = 0
compid = 0
comp_desc = 0
pmfreqx = 0
pmfreq = 0
pmfreqtype = 0

# PM Schedule Draft (as provided by eMaint)
currentFile = open('pm_schedule.csv', encoding='windows-1252')
readerCurrentFile = csv.reader(currentFile)

# Loop 1
for row in readerCurrentFile:
    if row[0] == "pmid":
        continue
    # defines row items
    pmid = row[0]
    compid = row[1]
    comp_desc = row[2]
    # quantity of pm frequency
    pmfreqx_temp = row[3]
    # unit of pm frequency, choices are: Months, Weeks
    pmfreq = row[4]
    # pmfreqtype is currently only static, not sure what other options we have
    pmfreqtype = row[5]
    # pmnextdate is the next scheduled due date from this one. We probably need logic later that closes out any past due date
    pmnextdate = row[6]
    # Task Number - this is what we want to change
    # pass
    # We want to change this to task header's task_desc
    sched_task_desc = row[8]
    # last done date
    last_pm_date = row[9]
    #
    # determines frequency search criteria
    #
    try:
        pmfreqx = int(pmfreqx_temp)
    except (TypeError, ValueError):
        print("Invalid PM frequency data, Skipping row " + pmid)
        error_in_row.append(pmid)
        continue
    #
    # defines frequency search variable
    #
    freq_search_var = ""
    if pmfreq == "Weeks":
        freq_search_var = dict_week[pmfreqx]
    elif pmfreq == "Months":
        freq_search_var = dict_month[pmfreqx]
    if not freq_search_var:
        print("Error in assigning frequency" + compid + " " + str(pmfreqx) + " " + pmfreq)
        error_in_row.append(pmid)
        continue
    # defines Equipment ID Search Variable
    print(compid + " frequency found: " + str(pmfreqx) + " " + str(pmfreq))
    compid_search_var = re.compile(compid, re.IGNORECASE)
    #
    # Matching function - search taskHeader for data
    #
    # PM Task Header Reference
    taskHeader = open('taskheader.csv', encoding='windows-1252')
    readerTaskHeader = csv.reader(taskHeader)
    for task_row in readerTaskHeader:
        # task_row[0]: taskHeader pm number
        # task_row[1]: taskHeader task_desc
        # task_row[2]: taskHeader task_notes
        #
        # search for compid
        compid_match = ""
        compid_match = compid_search_var.search(task_row[1])
        if not compid_match:
            print(task_row[1] + " does not match ID for " + compid + ", trying next row.")  # debug 2
            continue  # <<< STOPS ITERATING RIGHT OVER HERE
        print("Found compid " + task_row[1])  # debug line
        #
        freq_match = ""
        freq_match = freq_search_var.search(task_row[1])
        if not freq_match:
            print(task_row[1] + " does not match freq for " + compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ", trying next row.")  # debug line
            continue
        print("Frequency Match: " + compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)])  # freq debug line
        #
        task_no = ""
        print("Assigning Task Number to " + task_row[0])
        task_no = task_row[0]
        if task_row:
            break
    #
    # error check
    #
    if not task_no:
        print("ERROR IN SEARCH " + compid + " " + pmid)
        error_in_row.append(pmid)
        continue
    #
    # Writes Rows
    #
    writerNewPMSchedule.writerow([pmid, compid, comp_desc, pmfreqx, pmfreq, pmfreqtype, pmnextdate, task_no, sched_task_desc, last_pm_date])
    print("Successful write of PM schedule row")
    print(compid + " " + dict_freq_names[str(pmfreqx) + str(pmfreq)] + ": " + pmid + " " + task_no)
    print("==============")

# Error reporting lined out for now
# for row in error_in_row:
#     writerNewPMSchedule.writerow(["Error in row:", str(error_in_row[row])])
#     print("Error in row: " + str(error_in_row[row]))

print("Finished")
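As an aside, one pattern that sidesteps reader state entirely is to cache the task header rows in a list once and loop over that list inside loop 1; a few MB of rows is normally fine to hold in memory. A minimal sketch of the idea (file names, encoding and column positions are taken from the code above; the matching condition is only a placeholder):
import csv

# Sketch only: read taskheader.csv once, then reuse the cached rows so the
# inner loop never depends on where a file handle happens to be positioned.
with open('taskheader.csv', encoding='windows-1252') as task_header_file:
    task_rows = list(csv.reader(task_header_file))

with open('pm_schedule.csv', encoding='windows-1252') as current_file:
    for row in csv.reader(current_file):      # LOOP 1
        if row[0] == "pmid":                  # skip the header row
            continue
        compid = row[1]
        task_no = ""                          # reset for every outer row
        for task_row in task_rows:            # LOOP 2 always starts from the first task row
            if compid.lower() not in task_row[1].lower():
                continue                      # skips only this task_row, not the outer row
            # (the frequency check from the full code would go here)
            task_no = task_row[0]
            break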

Related

How do I read, loop, compare, manipulate and write CSV files in Python 2 correctly

I have tried a lot now, and also read a lot over the past days, but I can't come up with a correct solution to my problem. Maybe someone can give me a helping hand.
I have two CSV files, where wdw_clip_db_2018-01-17_2 (4720 lines) should have all the data and wdw_content_complete (2752 lines) only a subset.
wdw_clip_db_2018-01-17_2.csv:
11,0_7cjgob0v,9000301_AzubiGlueckT1.mxf,0_7cjgob0v.mpg
43,0_heor15yl,,0_heor15yl.mpg
1616,0_dfopff5t,578_Bier.MXF,0_dfopff5t.mpg
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
1931,0_cbx3zgw6,9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6.mpg
wdw_content_complete.csv:
1737,9000301_AzubiGlueckT1.mxf,0_7cjgob0v
1451,578_Bier.MXF,0_dfopff5t
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
1762,9070201_KeinGeldFuerGeschen.mxf,NULL
What I need to come up with are the following CSV files, readable by Excel:
wdw_clean_assets.csv:
9000301_AzubiGlueckT1.mxf,0_7cjgob0v
578_Bier.MXF,0_dfopff5t
Where wdw_clean_assets holds every line which matches both the file and the external_reference (e.g. 0_7cjgob0v).
wdw_to_add_ext_refs.csv:
9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6
Where wdw_to_add_ext_refs holds every line which matches the file but has a NULL in the external_reference field. The NULL is replaced with the external reference found in wdw_clip_db_2018-01-17_2.csv.
When I compare the number of lines, there seem to be lines in wdw_content_complete.csv which are not in wdw_clip_db_2018-01-17_2.csv. To be honest, this shouldn't be the case, so I need to find out what's wrong with these lines. Therefore I need to put the rest of wdw_content_complete.csv in a new CSV file.
wdw_to_clean_assets.csv:
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
And finally I need the rest of both CSVs, wdw_clip_db_2018-01-17_2.csv and wdw_content_complete.csv, in two separate CSVs. For this I tried to somehow subtract one list from another, which unfortunately also doesn't work correctly.
wdw_hansi_assets_rest.csv:
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
wdw_mediahub_assets_rest.csv:
1433,445_Holzverarbeitung.MXF,NULL
What I got so far is this Python Script:
import csv

# CSV Files
# wdw_clip_db_2018-01-17_2.csv
# wdw_content_complete.csv

# Reading the CSV Files
hansi_assets = []
with open('wdw_clip_db_2018-01-17_2.csv') as hansi_db:
    reader = csv.reader(hansi_db)
    for row in reader:
        hansi_assets.append(row)
    hansi_db.close()

mediahub_assets = []
with open('wdw_content_complete.csv') as mediahub_db:
    reader = csv.reader(mediahub_db)
    for row in reader:
        mediahub_assets.append(row)
    mediahub_db.close()

clean_asset = []
clean_assets = []
to_add_ext_ref = []
to_add_ext_refs = []
to_clean_assets = []
hansi_assets_rest = []
mediahub_assets_rest = []
hansi_assets_rm = []
mediahub_assets_rm = []
num_clean_rwos = 0
num_to_clean_rows = 0
num_to_add_ext_refs = 0
num_dirty_rows = 0
num_hansi_iterations = 0
num_mediahub_iterations = 0
num_mediahub_null = 0
num_hansi_mediahub_matches = 0

# Looping over the CSV Files
for hansi_asset in hansi_assets:
    num_hansi_iterations += 1
    for mediahub_asset in mediahub_assets:
        num_mediahub_iterations += 1
        # Checking if there are similar, clean entries
        if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and hansi_asset[1] == mediahub_asset[2]:
            clean_assets.append(mediahub_asset)
            # Counting for evaluation reasons
            num_clean_rwos += 1
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)
        # Checking if there are entries which miss the Ext_Ref field and replacing the NULL by the corresponding Ext_Ref in the hansi_asset
        elif hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and mediahub_asset[2] == "NULL":
            to_add_ext_ref = [mediahub_asset[1], hansi_asset[1]]
            to_add_ext_refs.append(to_add_ext_ref)
            # Counting for evaluation reasons
            num_to_add_ext_refs += 1
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)
        # Checking if there are entries that don't match
        elif hansi_asset[2] != mediahub_asset[1] or hansi_asset[3] != mediahub_asset[1]:
            to_clean_assets.append([mediahub_asset[1], mediahub_asset[2]])
            # Counting for evaluation reasons
            num_to_clean_rows += 1
            # Creating a list to subtract from its origin to get the Rest
            mediahub_assets_rm.append(mediahub_asset)
            hansi_assets_rm.append(hansi_asset)

# Just counting the Matches
for hansi_asset in hansi_assets:
    for mediahub_asset in mediahub_assets:
        if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1]:
            num_hansi_mediahub_matches += 1

# Just counting the NULLs
for mediahub_asset in mediahub_assets:
    num_mediahub_iterations += 1
    if mediahub_asset[2] == "NULL":
        num_mediahub_null += 1

# for mediahub_asset_rm in mediahub_assets_rm:
#     if mediahub_asset[1] != mediahub_asset_rm[1]:
#         mediahub_assets_rest = Diff(mediahub_assets, mediahub_assets_rm)

# Trying to subtract mediahub_assets_rm from mediahub_assets to get the Rest
mediahub_assets_rest = [item for item in mediahub_assets_rm if item not in mediahub_assets]
hansi_assets_rest = [item for item in hansi_assets_rm if item not in hansi_assets]

# Printing some lines for evaluation
print hansi_assets[1]
print mediahub_assets[1]
print clean_assets[1]
print to_clean_assets[1]
print to_add_ext_refs[1]
print hansi_assets_rest[1]
print mediahub_assets_rest[1]
print hansi_assets_rm[1]
print mediahub_assets_rm[1]
print "Num Hansi Assets: " + str(len(hansi_assets))
print "Num Mediahub Assets: " + str(len(mediahub_assets))
print "Num Clean Assets: " + str(len(clean_assets))
print "Num Hansi Assets to remove: " + str(len(hansi_assets_rm))
print "Num Mediahub Assets to remove: " + str(len(mediahub_assets_rm))
print "Num Hansi Rest Assets: " + str(len(hansi_assets_rest))
print "Num Mediahub Rest Assets: " + str(len(mediahub_assets_rest))
print "Num Mediahub NULLs: " + str(num_mediahub_null)
print "Num Hansi Mediahub Matches: " + str(num_hansi_mediahub_matches)
print "Num Clean Rows: " + str(num_clean_rwos)
print "Num To Clean Rows: " + str(num_to_clean_rows)
print "Num To Add Ext_Ref: " + str(num_to_add_ext_refs)
print "Num Dirty Rows: " + str(num_dirty_rows)
print "Num Hansi Iterations: " + str(num_hansi_iterations)
print "Num Mediahub Iterations: " + str(num_mediahub_iterations / num_hansi_iterations)

# Writing clean_assets to a file
wdw_clean_assets = []
with open('wdw_clean_assets.csv', 'w') as wdw_clean_assets:
    writer = csv.writer(wdw_clean_assets)
    for row in clean_assets:
        writer.writerow([row])
    wdw_clean_assets.close()

wdw_to_add_ext_refs = []
with open('wdw_to_add_ext_refs.csv', 'w') as wdw_to_add_ext_refs:
    writer = csv.writer(wdw_to_add_ext_refs)
    for row in to_add_ext_refs:
        writer.writerow([row])

wdw_to_clean_assets = []
with open('wdw_to_clean_assets.csv', 'w') as wdw_to_clean_assets:
    writer = csv.writer(wdw_to_clean_assets)
    for row in to_clean_assets:
        writer.writerow([row])
    wdw_to_clean_assets.close()

wdw_hansi_assets_rest = []
with open('wdw_hansi_assets_rest.csv', 'w') as wdw_hansi_assets_rest:
    writer = csv.writer(wdw_hansi_assets_rest)
    for row in hansi_assets_rest:
        writer.writerow([row])
    wdw_hansi_assets_rest.close()

wdw_mediahub_assets_rest = []
with open('wdw_mediahub_assets_rest.csv', 'w') as wdw_mediahub_assets_rest:
    writer = csv.writer(wdw_mediahub_assets_rest)
    for row in mediahub_assets_rest:
        writer.writerow([row])
    wdw_mediahub_assets_rest.close()
Any help appreciated!
Manuel
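As an aside, since both files key off the file name, the matching can be done without the nested loop or the list subtraction by indexing the clip DB in a dict and making a single pass over wdw_content_complete.csv. A rough sketch in the same Python 2 style as the question (it assumes the column order shown in the sample data above):
import csv

# Sketch only: index wdw_clip_db by its two file-name columns for O(1) lookups.
hansi_by_file = {}
with open('wdw_clip_db_2018-01-17_2.csv') as hansi_db:
    for row in csv.reader(hansi_db):
        # row: id, external_reference, mxf_name, mpg_name
        hansi_by_file[row[2]] = row
        hansi_by_file[row[3]] = row

clean_assets, to_add_ext_refs, to_clean_assets = [], [], []
with open('wdw_content_complete.csv') as mediahub_db:
    for row in csv.reader(mediahub_db):
        # row: id, file_name, external_reference
        match = hansi_by_file.get(row[1])
        if match is None:
            to_clean_assets.append(row)                  # file unknown in the clip DB
        elif row[2] == "NULL":
            to_add_ext_refs.append([row[1], match[1]])   # fill the NULL from the clip DB
        elif row[2] == match[1]:
            clean_assets.append([row[1], row[2]])        # file and external_reference both match
The two "rest" files could then be built from whichever rows were never matched. Also note that csv.writer's writerow(row) writes one column per list element, whereas writerow([row]) puts the whole list into a single cell.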

Python parsing csv file to store in file

I have not used Python in years and am trying to get back into it. I have an input file (.csv) that I want to parse and store the output in an output .csv or .txt file.
I have managed to parse the .csv file using the code below, and for the most part it works, but I can't get it to save to a file (Issue 1) without getting the error below (Error 1).
import csv
import re
import itertools

file_name = 'PhoneCallData1.txt'
try:
    lol = list(csv.reader(open(file_name, 'r'), delimiter=' '))
    count = 0
except:
    print('File cannot be opened:', file_name)
    exit()
try:
    fout = open('output.txt', 'w')
except:
    Print("File cannot be written to:", "OutputFile")
    exit()
d = dict()
for item in itertools.chain(lol):  # Lists all items (field) in the CSV file.
    count += 1  # counter to keep track of row I'm looping through
    if lol[count][3] is None:
        print("value is not blank")
        count += 1
    else:
        try:
            check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3])  # check to determine if date is a date
        except:
            continue
        check_cost = re.compile(r'($+\d*)', lol[count][9])  # check to determine if value is a cost
        if check_date == TRUE:
            try:
                key = lol[count][3]  # If is a date value, store key
            except ValueError:
                continue
        if check_cost == TRUE:
            value = lol[count][9]  # if is a cost ($) store value
            d[key] = value
            print(d[key])
            # fout.write((d[key])
# What if there is no value in the cell?
# I keep getting "IndexError: list index out of range", anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charge
And now comes the complicated part: the file I need to parse has a number of irrelevant rows of data before and in between the required data.
Data Format
What I want to do:
I want to iterate over two columns of data and only store the rows that have a date or cost in them, discarding the rest of the data.
import csv
import re
import itertools

lol = list(csv.reader(open('PhoneCallData1.txt', 'r'), delimiter=' '))
count = 0
d = dict()
for item in itertools.chain(lol):  # Lists all items (field) in the CSV file.
    count += 1  # counter to keep track of row I'm looping through
    check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3])  # check to determine if value is a date
    check_cost = re.compile(r'($+\d*)', lol[count][9])  # check to determine if value is a cost
    if check_date == TRUE:
        key = lol[count][3]  # If is a date value, store key
    if check_cost == TRUE:
        value = lol[count][9]  # if is a cost ($) store value
        d[key] = value
        print(d[key])
# What if there is no value in the cell?
# I keep getting "IndexError: list index out of range", anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charges
What I have tried:
I tried to index the data after I loaded it, but that didn't seem to work.
I created the following to only look at rows that were more than a certain length, but it's terrible code. I was hoping for something more practical and reusable.
import re
with open('PhoneCallData1.txt', 'r') as f, open('sample_output.txt', 'w') as fnew:
    for line in f:
        if len(line) > 50:
            print(line)
            fnew.write(line + '\n')

import csv
lol = list(csv.reader(open('PhoneCallData1.txt', 'rb'), delimiter='\t'))
# d = dict()
# key = lol[5][0]    # cell A7
# value = lol[5][3]  # cell D7
# d[key] = value     # add the entry to the dictionary
Keep getting index out of bounds errors
import re
import csv
match=re.search(r'(\d+/\d+/\d+)','testing date 11/12/2017')
print match.group(1)
Trying to use regex to search for the date in the first column of data.
NOTE: I wanted to try Pandas but I feel I need to start here. Any Help would be awesome.
Whether the next record needs to be parsed must be decided explicitly. I have answered a similar question in the same way: a finite-state machine may help.
main code is:
state = 'init'
output = []
# for line loop:
if state == 'init':  # seek for start parsing
    # check if start parsing
    state = 'start'
elif state == 'start':  # start parsing now
    # parsing
    # check if need to end parsing
    state = 'init'
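Applied to a call-log file with junk before and between the interesting rows, the same idea might look like this (a sketch only; the start and end markers are hypothetical and would have to be adapted to the real file layout):
# Sketch: keep only the lines between a hypothetical header marker and the
# next blank line; everything outside that window is ignored.
state = 'init'
output = []
with open('PhoneCallData1.txt', 'r') as f:
    for line in f:
        if state == 'init':
            if line.startswith('Date'):   # hypothetical marker that starts the data block
                state = 'start'
        elif state == 'start':
            if not line.strip():          # hypothetical blank line that ends the data block
                state = 'init'
            else:
                output.append(line)       # a real data row: parse date/cost from it here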
import csv
import re
import itertools
import timeit

start_time = timeit.default_timer()
# code you want to evaluate
file_name = 'PhoneCallData.txt'
try:
    lol = list(csv.reader(open(file_name, 'r'), delimiter=' '))
except:
    print('File cannot be opened:', file_name)
    exit()
try:
    fout = open('output.txt', 'w')
except:
    print("File cannot be written to:", "OutputFile")
    exit()

# I could assign key value pairs and store in dictionary. Then print, search, etc. on the dictionary. Version 2
# d = dict()
count = 0
total = 0
for row in lol:  # Lists all items (field) in the CSV file.
    # print(len(row))
    count += 1  # counter to keep track of row I'm looping through
    if len(row) == 8:
        if row[2].isdigit():
            # Remove the $ and convert to float
            cost = re.sub('[$]', '', row[7])
            # Assign total value
            try:
                # Calculate total for verification purposes
                total = total + float(cost)
                total = round(total, 2)
            except:
                continue
            string = str(row[2] + " : " + (row[7]) + " : " + str(total) + "\n")
            print(string)
            fout.write(string)
    if len(row) == 9:
        if row[2].isdigit():
            # Remove the $ and convert to float
            cost = re.sub('[$]', '', row[8])
            # Assign total value
            try:
                # Calculate total for verification purposes
                total = total + float(cost)
                total = round(total, 2)
            except:
                continue
            string = str(row[2] + " : " + row[8] + " : " + str(total) + "\n")
            print(string)
            fout.write(string)
    if len(row) == 10:
        # print(row[2] + ":" + row[9])
        # Remove the $ and convert to float
        cost = re.sub('[$]', '', row[9])
        # Assign total value
        try:
            # Calculate total for verification purposes
            total = total + float(cost)
            total = round(total, 2)
        except:
            continue
        string = str(row[2] + " : " + row[9] + " : " + str(total) + "\n")
        print(string)
        fout.write(string)

# Convert to string so I can print and store in file
count_string = str(count)
total_string = str(total)
total_string.split('.', 2)
# Write to screen
print(total_string + " Total\n")
print("Rows parsed: " + count_string)
# write to file
fout.write(count_string + " Rows were parsed\n")
fout.write(total_string + " Total")
# Calculate time spent on task
elapsed = timeit.default_timer() - start_time
round_elapsed = round(elapsed, 2)
string_elapsed = str(round_elapsed)
fout.write(string_elapsed)
print(string_elapsed + " seconds")
fout.close()

Python function performance

I have 130 lines of code, and everything except lines 79 to 89 works fine, running in ~0.16 seconds. However, after adding the function that spans those 10 lines (79-89), the program takes 70-75 seconds. In that function, the data file (u.data) is 100,000 lines of numerical data in this format:
196 242 3 881250949
i.e. four grouped numbers on every line. The thing is that when I ran that function in another Python file while testing (before implementing it in the main program), it ran in 0.15 seconds; however, when I implemented it in the main program (same code), the whole program takes almost 70 seconds.
Here is my code:
""" Assignment 5: Movie Reviews
Date: 30.12.2016
"""
import os.path
import time

start_time = time.time()

""" FUNCTIONS """

# Getting film names in film folder
def get_film_name():
    name = ''
    for word in read_data.split(' '):
        if ('(' in word) == False:
            name += word + ' '
        else:
            break
    return name.strip(' ')

# Function for removing date for comparison
def throw_date(string):
    a_list = string.split()[:-1]
    new_string = ''
    for i in a_list:
        new_string += i + ' '
    return new_string.strip(' ')

def film_genre(film_name):
    oboist = []
    genr_list = ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary',
                 'Drama', 'Fantasy', 'Movie-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
                 'War', 'Western']
    for item in u_item_list:
        if throw_date(str(item[1])) == film_name:
            for i in range(4, len(item)):
                oboist.append(item[i])
    dictionary = dict(zip(genr_list, oboist))
    genres = ''
    for key, value in dictionary.items():
        if value == '1':
            genres += key + ' '
    return genres.strip(' ')

def film_link(film_name):
    link = ''
    for item in u_item_list:
        if throw_date(str(item[1])) == film_name:
            link += item[3]
    return link

def film_review(film_name):
    review = ''
    for r, d, filess in os.walk('film'):
        for fs in filess:
            fullpat = os.path.join(r, fs)
            with open(fullpat, 'r') as a_file:
                data = a_file.read()
                if str(film_name).lower() in str(data.split('\n', 1)[0]).lower():
                    for i, line in enumerate(data):
                        if i > 1:
                            review += line
                a_file.close()
    return review

def film_id(film_name):
    for film in u_item_list:
        if throw_date(film[1]) == film_name:
            return film[0]

def total_user_and_rate(film_name):
    rate = 0
    user = 0
    with open('u.data', 'r') as data_file:
        rate_data = data_file.read()
        for l in rate_data.split('\n'):
            if l.split('\t')[1] == film_id(film_name):
                user += 1
                rate += int(l.split('\t')[2])
        data_file.close()
    print('Total User:' + str(int(user)) + '\nTotal Rate: ' + str(rate / user))

""" MAIN CODE """
review_file = open("review.txt", 'w')
film_name_list = []
# Look for txt files and extract the film names
for root, dirs, files in os.walk('film'):
    for f in files:
        fullpath = os.path.join(root, f)
        with open(fullpath, 'r') as file:
            read_data = file.read()
            film_name_list.append(get_film_name())
            file.close()

with open('u.item', 'r') as item_file:
    item_data = item_file.read()
    item_file.close()

u_item_list = []
for line in item_data.split('\n'):
    temp = [word for word in line.split('|')]
    u_item_list.append(temp)

film_name_list = [i.lower() for i in film_name_list]
updated_film_list = []
print(u_item_list)

# Operation for review.txt
for film_data_list in u_item_list:
    if throw_date(str(film_data_list[1]).lower()) in film_name_list:
        strin = film_data_list[0] + " " + film_data_list[1] + " is found in the folder" + '\n'
        print(film_data_list[0] + " " + film_data_list[1] + " is found in the folder")
        updated_film_list.append(throw_date(str(film_data_list[1])))
        review_file.write(strin)
    else:
        strin = film_data_list[0] + " " + film_data_list[1] + " is not found in the folder. Look at " + film_data_list[3] + '\n'
        print(film_data_list[0] + " " + film_data_list[1] + " is not found in the folder. Look at " + film_data_list[3])
        review_file.write(strin)

total_user_and_rate('Titanic')
print("time elapsed: {:.2f}s".format(time.time() - start_time))
And my question is: what can be the reason for that? Is the function total_user_and_rate(film_name) problematic? Or can there be other problems in other parts? Or is it normal because of the file?
I see a couple of unnecessary things.
You call film_id(film_name) inside the loop for every line of the file; you really only need to call it once, before the loop.
You don't need to read the file and then split it to iterate over it; just iterate over the lines of the file.
You split each line twice; just do it once.
Refactored for these changes:
def total_user_and_rate(film_name):
    rate = 0
    user = 0
    f_id = film_id(film_name)
    with open('u.data', 'r') as data_file:
        for line in data_file:
            line = line.split('\t')
            if line[1] == f_id:
                user += 1
                rate += int(line[2])
        data_file.close()
    print('Total User:' + str(int(user)) + '\nTotal Rate: ' + str(rate / user))
In your test you were probably testing with a much smaller u.item file. Or doing something else to ensure film_id was much quicker. (By quicker, I mean it probably ran on the nanosecond scale.)
The problem you have is that computers are so fast you didn't realise when you'd actually made a big mistake doing something that runs "slowly" in computer time.
If your if l.split('\t')[1] == film_id(film_name): line takes 1 millisecond, then when processing a 100,000 line u.data file, you could expect your total_user_and_rate function to take 100 seconds.
The problem is that film_id iterates over all your films to find the correct id for every single line in u.data. You'd be lucky if the film_id you're looking for is near the beginning of u_item_list, because then the function would return in probably less than a nanosecond. But as soon as you run your new function for a film near the end of u_item_list, you'll notice performance problems.
wwii has explained how to optimise the total_user_and_rate function. But you could also gain performance improvements by changing u_item_list to use a dictionary. This would improve the performance of functions like film_id from O(n) complexity to O(1). I.e. it would still run on the nanosecond scale no matter how many films are included.
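A sketch of that dictionary change, reusing names from the question's code and assuming titles are unique once the year is stripped off by throw_date:
# Sketch: build the lookup once, right after u_item_list has been filled.
films_by_name = {throw_date(film[1]): film for film in u_item_list if len(film) > 1}

def film_id(film_name):
    film = films_by_name.get(film_name)   # O(1) dict lookup instead of scanning the list
    return film[0] if film else None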

Twitter API 404 errors - Python (continued)

I'm trying to use GET users/lookup from the Twitter API in a Python script to identify screen names based on a list of user IDs. The script doesn't seem to be able to handle 404 errors: I assume the whole request with 100 user IDs is not found by Twitter, so the for loop doesn't even begin. How do I iterate over 100 user IDs at a time, while still respecting the rate limit, if one of the 100 IDs will cause the whole request to 404? Is there a way to handle this error while still getting the response for the other IDs in the same request? My experiments with "except ValueError" didn't seem to solve this...
I'd very much appreciate any advice or tips you can give!
while total_request_counter <= request_total:
    while list_counter_last <= len(list) + 100:  # while last group of 100 items is lower or equal than total number of items in list
        while current_request_counter < 180:
            response = twitter.users.lookup(include_entities="false", user_id=",".join(list[list_counter_first:list_counter_last]))  # API query for 100 users
            for item in list[list_counter_first:list_counter_last]:  # parses twitter IDs in the list by groups of 100 to record results
                try:  # necessary for handling errors
                    results = str(response)
                    results = results[results.index("u'screen_name': u'", results.index(item)) + 18:results.index("',", results.index("u'screen_name': u'", results.index(item)) + 18)]  # looks for username section in the query output
                    text_file = open("output_IDs.txt", "a")  # opens current txt output / change path to desired output
                    text_file.write(str(item) + "," + results + "\n")  # adds twitter ID, info lookup result, and a line skip to the txt output
                    text_file.close()
                    error_counter = error_counter + 1
                    print str(item) + " = " + str(results)
                except:  # creates exception to handle errors
                    text_file = open("output_IDs.txt", "a")  # opens current txt output / change path to desired output
                    text_file.write(str(item) + "," + "ERROR " + str(response.headers.get('h')) + "\n")  # adds twitter ID, error code, and a line skip to the txt output
                    text_file.close()
                    print str(item) + " = ERROR"
                    continue
            print "Succesfully processed " + str(error_counter) + " users of request number " + str(current_request_counter + 1) + ". " + str(179 - current_request_counter) + " requests left until break."  # confirms latest batch processing
            print ""
            list_counter_first = list_counter_first + 100  # updates list navigation to move on to next group of 100
            list_counter_last = list_counter_last + 100
            error_counter = 0  # resets error counter for the next round
            current_request_counter = current_request_counter + 1  # updates current request counter to stay below rate limit of 180
        t = str(datetime.now())
        print ""
        print "Taking a break, back in 16 minutes. Approximate time left: " + str((len(list) / 18000 * 15) - (15 * total_request_counter)) + " minutes. Current timestamp: " + t
        print ""
        current_request_counter = 0
        total_request_counter = total_request_counter + 1
        time.sleep(960)  # suspends activities for 16 minutes to respect rate limit
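One way to keep a single bad ID from sinking a whole batch is to wrap the lookup itself in a try/except and, if the 100-ID request fails, retry that group one ID at a time so only the genuinely missing IDs are skipped. A rough sketch (it reuses the twitter.users.lookup call from the code above; the except clause is deliberately broad because the exact exception class depends on the Twitter client library, and the per-ID retries do count against the rate limit):
# Sketch: fall back to per-ID lookups when a whole batch of 100 IDs 404s.
def lookup_batch(twitter, id_batch):
    try:
        return twitter.users.lookup(include_entities="false",
                                    user_id=",".join(id_batch))
    except Exception:                  # e.g. a 404 because one ID no longer exists
        found = []
        for single_id in id_batch:     # retry one at a time; extra requests, but only bad IDs are lost
            try:
                found.extend(twitter.users.lookup(include_entities="false",
                                                  user_id=single_id))
            except Exception:
                pass                   # this ID really is unavailable; skip it
        return found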

Is there some kind of limit to the amount of output Python 3.4 allows using the write() method at one time?

I put trailing print() calls right next to my write() lines at the end of my code to test why my output files were incomplete. But the print() output is "all the stuff" I expect, while the write() output is off by a confusing amount (only 150 out of 200 'things'). Reference image of output: IDLE versus external output file.
FYI: Win 7 64 // Python 3.4.2
My modules take an SRT captions file ('test.srt') and return a list object I create from it; in particular, one with 220 list entries of the form: [[(index), [time], string]]
times = open('times.txt', 'w')

### A portion of Riobard's SRT Parser: srt.py
import re

def tc2ms(tc):
    ''' convert timecode to millisecond '''
    sign = 1
    if tc[0] in "+-":
        sign = -1 if tc[0] == "-" else 1
        tc = tc[1:]
    TIMECODE_RE = re.compile('(?:(?:(?:(\d?\d):)?(\d?\d):)?(\d?\d))?(?:[,.](\d?\d?\d))?')
    match = TIMECODE_RE.match(tc)
    try:
        assert match is not None
    except AssertionError:
        print(tc)
    hh, mm, ss, ms = map(lambda x: 0 if x == None else int(x), match.groups())
    return ((hh*3600 + mm*60 + ss) * 1000 + ms) * sign

# my code
with open('test.srt') as f:
    file = f.read()

srt = []
for line in file:
    splitter = file.split("\n\n")

# SRT splitter
i = 0
j = len(splitter)
for items in splitter:
    while i <= j - 2:
        split_point_1 = splitter[i].index("\n")
        split_point_2 = splitter[i].index("\n", split_point_1 + 1)
        index = splitter[i][:split_point_1]
        time = [splitter[i][split_point_1:split_point_2]]
        time = time[0][1:]
        string = splitter[i][split_point_2:]
        string = string[1:]
        list = [[(index), [time], string]]
        srt += list
        i += 1

# time info outputter
i = 0
j = 1
for line in srt:
    if i != len(srt) - 1:
        indexer = srt[i][1][0].index(" --> ")
        timein = srt[i][1][0][:indexer]
        timeout = srt[i][1][0][-indexer:]
        line_time = (tc2ms(timeout) - tc2ms(timein)) / 1000
        space_time = ((tc2ms((srt[j][1][0][:indexer]))) - (tc2ms(srt[i][1][0][-indexer:]))) / 1000
        out1 = "The space between Line " + str(i) + " and Line " + str(j) + " lasts " + str(space_time) + " seconds." + "\n"
        out2 = "Line " + str(i) + ": " + str(srt[i][2]) + "\n\n"
        times.write(out1)
        times.write(out2)
        print(out1, end="")
        print(out2)
        i += 1
        j += 1
    else:
        indexer = srt[i][1][0].index(" --> ")
        timein = srt[i][1][0][:indexer]
        timeout = srt[i][1][0][-indexer:]
        line_time = (tc2ms(timeout) - tc2ms(timein)) / 1000
        outend = "Line " + str(i) + ": " + str(srt[i][2]) + "\n<End of File>"
        times.write(outend)
        print(outend)
My two write() method output files, respectively, only print out either ~150 or ~200 items of the 220 things it otherwise correctly prints to the screen.
You want to close your times file when done writing; operating systems use write buffers to speed up file I/O, collecting larger blocks of data to be written to disk in one go; closing the file flushes that buffer:
times.close()
Consider opening the file in a with block:
with open('times.txt', 'w') as times:
    # all code that needs to write to times
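A self-contained illustration of the same point (stand-in data rather than the SRT code above): nothing is guaranteed to be on disk until the buffer is flushed, and the with block does that automatically when it exits:
# Sketch: the with block flushes and closes the file even if an exception
# is raised partway through the loop.
lines = ["Line %d\n" % i for i in range(220)]   # stand-in for the real output
with open('times.txt', 'w') as times:
    for line in lines:
        times.write(line)
# at this point times is closed and all 220 lines are on disk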
