I have multiple lists where the first index of each list is related to the first index of the others, the second to the second, and so on. I need a way of linking the order of these lists together. I have a list of teams (some are duplicates), and I need an if statement that says: if there's a duplicate of this team, compare it to the duplicate, take the related value in the other list, and keep the better one.
import sys
import itertools
from itertools import islice
fileLocation = input("Input the file location of ScoreBoard: ")
T = []
N = []
L = []
timestamps = []
teamids = []
problemids = []
inputids = []
scores = []
dictionary = {}
amountOfLines = len(open(fileLocation).readlines())
with open(fileLocation) as input1:
    # skip the two header lines, then parse each log entry
    for line in islice(input1, 2, amountOfLines):
        parsed = line.strip().split()
        timestamps.append(parsed[0])
        teamids.append(parsed[1])
        problemids.append(parsed[2])
        inputids.append(parsed[3])
        scores.append(parsed[4])
def checkIfDuplicates(teamids):
    '''Check if the given list contains any duplicates'''
    return len(teamids) != len(set(teamids))
# this is where I'm stuck -- the lines below don't work yet:
for i in teamids:
    if checkIfDuplicates(teamids):      # checkIfDuplicates takes the whole list, not one id
        dictionary['team%s' % i] = {}
# if dictionary < amountOfTeams:        (can't compare a dict to an int)
#     dictionary['team%s' %]            (incomplete)
# for i in score:
#     dictionary[teamid][]              (incomplete)
print(dictionary)
The rough idea: loop through each list item and delete it if it is a duplicate, something like:
for i in list1[:]:       # iterate over a copy so removing items is safe
    if i in list2:
        list1.remove(i)
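For the linking itself, here is a minimal sketch: zip() ties the parallel lists together by position, and a dictionary keeps one entry per team (the assumption here is that a higher score is better; flip the comparison if not):
best = {}
for team, score in zip(teamids, scores):
    score = int(score)                          # scores were read in as strings
    if team not in best or score > best[team]:
        best[team] = score                      # keep the better of the duplicates
print(best)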
I have a list that holds names of files, some of which are almost identical except for their timestamp string section. The list is in the format [name-subname-timestamp], for example:
myList = ['name1-001-202112020811.txt', 'name1-001-202112021010.txt', 'name1-002-202112021010.txt', 'name2-002-202112020811.txt']
What I need is a list that holds, for every name and subname, the most recent file as determined by the timestamp. I have started by creating a list that holds every [name-subname]:
name_subname_list = []
for row in myList:
name_subname_list.append((row.rpartition('-')[0]))
name_subname_list = set(name_subname_list) # {'name1-001', 'name2-002', 'name1-002'}
Not sure if it is the right approach; moreover, I am not sure how to continue. Any ideas?
This code does what you asked for: for each name-subname, it keeps the corresponding newest file:
from datetime import datetime as dt

dic = {}
for i in myList:
    sp = i.split('-')
    name_subname = sp[0] + '-' + sp[1]
    mytime = sp[2].split('.')[0]
    if name_subname not in dic:
        dic[name_subname] = mytime
    else:
        if dt.strptime(mytime, "%Y%m%d%H%M") > dt.strptime(dic[name_subname], "%Y%m%d%H%M"):
            dic[name_subname] = mytime

result = []
for name_subname in dic:
    result.append(name_subname + '-' + dic[name_subname] + '.txt')
which outputs a result like:
['name1-001-202112021010.txt',
'name1-002-202112021010.txt',
'name2-002-202112020811.txt']
Try this:
myList = ['name1-001-202112020811.txt', 'name1-001-202112021010.txt', 'name1-002-202112021010.txt', 'name2-002-202112020811.txt']
dic = {}
for name in myList:
    parts = name.split('-')
    dic.setdefault(parts[0] + '-' + parts[1], []).append(parts[2])

unique_list = []
for key, value in dic.items():
    unique_list.append(key + '-' + max(value))
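Note that max() works here because the zero-padded, fixed-width timestamps compare correctly as plain strings. A compact variant of the same idea (a sketch, under that same assumption):
from collections import defaultdict

groups = defaultdict(list)
for name in myList:
    prefix, _, stamp = name.rpartition('-')   # split off the timestamp part
    groups[prefix].append(stamp)

unique_list = [prefix + '-' + max(stamps) for prefix, stamps in groups.items()]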
So I have this list and a function that calculates the scores of my teams. I then put the team name and the score in a separate dictionary, but the problem is that I have a few duplicate teams in this list. There's a second item, which is whether or not the team's response was valid. If the result was this: team1 - score 100 - validresponse 0, I just want to get rid of the team even if it's a duplicate. However, if there are two duplicates of the SAME team and both their submissions were valid, then I want to add their scores together and store that as one entry in the dictionary. The only problem is that when doing this, the dictionary automatically disregards the other duplicates.
Here's my code:
import numpy as np
import pandas as pd
mylist = []
with open("input1.txt", "r") as infile:   # 'infile' avoids shadowing the input() builtin
    for line in infile:
        items = line.split()
        mylist.append([int(item) for item in items])

amountOfTestCases = mylist[0][0]
amountOfTeams = mylist[1][0]
amountOfLogs = mylist[1][1]
count = 1
count2 = 1
mydict = {}
teamlist = []
for i in mylist[2:]:
    count2 += 1
    teamlist.append(mylist[count2][1])
def find_repeating(lst, count=2):
    # note: this indexes 'counts' by value, so it assumes every value
    # in lst is a valid index (0 <= value < len(lst))
    ret = []
    counts = [None] * len(lst)
    for i in lst:
        if counts[i] is None:
            counts[i] = i
        elif i == counts[i]:
            ret += [i]
        if len(ret) == count:
            return ret
rep_indexes = np.where(pd.DataFrame(teamlist).duplicated(keep=False))
print(teamlist)
print(rep_indexes)
duplicate = find_repeating(teamlist)
def calculate_points(row):
    points = mylist[row][3] * 100
    points -= mylist[row][0]
    return points

for i in teamlist:
    count += 1
    mydict['team%s' % mylist[count][1]] = calculate_points(count)
print(mydict)
the teamlist = [5, 4, 1, 2, 5, 4]
"validresponse 0 - I just want to get rid of the team even if it's a duplicate":
check if the response is valid; if invalid, continue without doing anything else.
"if there are duplicates of the SAME team and both their submissions were valid, then I want to add their scores together":
check if the key/team already exists (i.e. it is a duplicate);
if it exists, get its value, add the new value, and assign the result to that dictionary key;
if it is not a duplicate, make a new key with that value.
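In code, a minimal sketch of those steps (assuming each parsed log row gives a team id, a score, and a validresponse flag; 'rows' and the field order are illustrative):
mydict = {}
for teamid, score, valid in rows:    # 'rows' stands in for your parsed log entries
    if valid == 0:
        continue                     # invalid response: drop it entirely
    key = 'team%s' % teamid
    if key in mydict:
        mydict[key] += score         # valid duplicate: add the scores together
    else:
        mydict[key] = score          # first valid entry for this team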
The problem is to return the name of the event that has the highest number of participants in this text file:
#Beyond the Imposter Syndrome
32 students
4 faculty
10 industries
#Diversifying Computing Panel
15 students
20 faculty
#Movie Night
52 students
So I figured I had to split it into a dictionary with the keys as the event names and the values as the sum of the integers at the beginning of the other lines. I'm having a lot of trouble and I think I'm making it more complicated than it is.
This is what I have so far:
def most_attended(fname):
    '''(str: filename, )'''
    d = {}
    f = open(fname)
    lines = f.read().split(' \n')
    print lines
    indexes = []
    count = 0
    for i in range(len(lines)):
        if lines[i].startswith('#'):
            event = lines[i].strip('#').strip()
            if event not in d:
                d[event] = []
            print d
            indexes.append(i)
            print indexes
        if not lines[i].startswith('#') and indexes != 0:
            num = lines[i].strip().split()[0]
            print num
            if num not in d[len(d)-1]:
                d[len(d)-1] += [num]
            print d
    f.close()
f.close()
import sys
from collections import defaultdict
from operator import itemgetter
def load_data(file_name):
    events = defaultdict(int)
    current_event = None
    for line in open(file_name):
        if line.startswith('#'):
            current_event = line[1:].strip()
        else:
            participants_count = int(line.split()[0])
            events[current_event] += participants_count
    return events

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\t{} <file>\n'.format(sys.argv[0]))
    else:
        events = load_data(sys.argv[1])
        print('{}: {}'.format(*max(events.items(), key=itemgetter(1))))
Here's how I would do it.
with open("test.txt", "r") as f:
docText = f.read()
eventsList = []
#start at one because we don't want what's before the first #
for item in docText.split("#")[1:]:
individualLines = item.split("\n")
#get the sum by finding everything after the name, name is the first line here
sumPeople = 0
#we don't want the title
for line in individualLines[1:]:
if not line == "":
sumPeople += int(line.split(" ")[0]) #add everything before the first space to the sum
#add to the list a tuple with (eventname, numpeopleatevent)
eventsList.append((individualLines[0], sumPeople))
#get the item in the list with the max number of people
print(max(eventsList, key=lambda x: x[1]))
Essentially you first split the document on "#", ignoring the first item because it is always empty. Now you have a list of events. For each event you go through every line after the first and add that line's leading number to the sum. Then you build a list of tuples like (eventName, numPeopleAtEvent). Finally you use max() to get the item with the maximum number of people.
This code prints ('Movie Night', 104); obviously you can format that however you like.
Similar answers to the ones above.
# 'lines' is the file read in as a list of lines, e.g.:
with open("test.txt") as f:
    lines = f.read().splitlines()

result = {}          # store the results
current_key = None   # placeholder to hold the current key
for line in lines:
    # figure out which event we are currently collecting data for;
    # if this line doesn't start with '#', assume it is info for the last seen event
    if line.startswith("#"):
        current_key = line[1:].strip()
        result[current_key] = 0
    elif current_key:
        # pull the number out of the string
        number = [int(s) for s in line.split() if s.isdigit()]
        # make sure we actually got a number in the line
        if len(number) > 0:
            result[current_key] = result[current_key] + number[0]

print(max(result, key=result.get))
This will print "Movie Night".
Your problem description says that you want to find the event with the highest number of participants. I tried a solution which does not use a list or dictionary.
PS: I am new to Python.
bigEventName = ""
participants = 0
curEventName = ""
curEventParticipants = 0
# Use RegEx to split the file by lines
itr = re.finditer("^([#\w+].*)$", lines, flags = re.MULTILINE)
for m in itr:
if m.group(1).startswith("#"):
# Whenever a new group is encountered, check if the previous sum of
# participants is more than the recent event. If so, save the results.
if curEventParticipants > participants:
participants = curEventParticipants
bigEventName = curEventName
# Reset the current event name and sum as 0
curEventName = m.group(1)[1:]
curEventParticipants = 0
elif re.match("(\d+) .*", m.group(1)):
# If it is line which starts with number, extract the number and sum it
curEventParticipants += int(re.search("(\d+) .*", m.group(1)).group(1))
# This nasty code is needed to take care of the last event
bigEventName = curEventName if curEventParticipants > participants else bigEventName
# Here is the answer
print("Event: ", bigEventName)
You can do it without a dictionary and keep it a little simpler by just tracking a running total per event:
with open('myfile.txt', 'r') as f:
    lines = [l.strip() for l in f if l.strip()]

highest = 0
event = ""
current_event = ""
current_total = 0
for l in lines:
    if l.startswith('#'):
        # a new event starts: close out the previous one first
        if current_total > highest:
            highest = current_total
            event = current_event
        current_event = l[1:]
        current_total = 0
    else:
        current_total += int(l.split()[0])
# don't forget to close out the last event
if current_total > highest:
    event = current_event
print(event)
I have created two CSV lists. One is the original CSV file; the other is a deduped version of that file. I have read each into a list, and for all intents and purposes they have the same format. Each list item is a string.
I am trying to use a list comprehension to find out which items were deleted by the deduplication. The length of the original is 16939 and the length of the deduped list is 15368. That's a difference of 1571, but my list comprehension's length is 368. Ideas?
deduped = open('account_de_ex.csv', 'r')
deduped_data = deduped.read()
deduped.close()
deduped = deduped_data.split("\r")
#read in file with just the account names from the full account list
account_names = open('account_names.csv', 'r')
account_data = account_names.read()
account_names.close()
account_names = account_data.split("\r")
# Get all the accounts that were deleted in the dedupe - i.e. get the duplicate accounts
dupes = [ele for ele in account_names if ele not in deduped]
Edit: following some notes in the comments, here is a test on my list comp and the lists themselves. Pretty much the same difference, 20 or so off, not the 1500 I need! Thanks!
print len(deduped)
deduped = set(deduped)
print len(deduped)
print len(account_names)
account_names = set(account_names)
print len(account_names)
15368
15368
16939
15387
Try running this code and see what it reports. This requires Python 2.7 or newer for collections.Counter but you could easily write your own counter code, or copy my example code from another answer: Python : List of dict, if exists increment a dict value, if not append a new dict
from collections import Counter

# read in original records
with open("account_names.csv", "rt") as f:
    rows = sorted(line.strip() for line in f)

# count how many times each row appears
counts = Counter(rows)

# get a list of tuples of (count, row) that only includes count > 1
dups = [(count, row) for row, count in counts.items() if count > 1]
dup_count = sum(count - 1 for count in counts.values() if count > 1)

# sort the list from largest number of dups to least
dups.sort(reverse=True)

# print a report showing how many dups
for count, row in dups:
    print("{}\t{}".format(count, row))

# get de-duped list
unique_rows = sorted(counts)

# read in de-duped list
with open("account_de_ex.csv", "rt") as f:
    de_duped = sorted(line.strip() for line in f)

print("List lengths: rows {}, uniques {}/de_duped {}, result {}".format(
    len(rows), len(unique_rows), len(de_duped), len(de_duped) + dup_count))

# lists should match since we sorted both lists
if unique_rows == de_duped:
    print("perfect match!")
else:
    # if lists don't match, find out what is going on
    uniques_set = set(unique_rows)
    deduped_set = set(de_duped)
    # find intersection of the two sets
    x = uniques_set.intersection(deduped_set)
    # print differences
    if x != uniques_set:
        print("Rows in original that are not in deduped:\n{}".format(sorted(uniques_set - x)))
    if x != deduped_set:
        print("Rows in deduped that are not in original:\n{}".format(sorted(deduped_set - x)))
To see what you really have in each list, you can proceed by construction.
If you only had unique elements:
deduped = range(15368)
account_names2 = range(15387)
dupes2 = [ele for ele in account_names2 if ele not in deduped]   # len is 19
However, because you have repetitions of both removed and non-removed elements, you actually end up with something like:
account_names = account_names2 + dupes2*18 + dupes2[:7] + account_names2[:1571 - 368]
dupes = [ele for ele in account_names if ele not in deduped]   # dupes will have 368 elements
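If what you want is the full multiset difference rather than the distinct leftovers, collections.Counter can express it directly (a sketch; on data shaped like the above it yields one entry for every removed occurrence, not just the 368 distinct values):
from collections import Counter

diff = Counter(account_names) - Counter(deduped)
dupes_full = list(diff.elements())   # one entry per removed occurrence
print(len(dupes_full))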
I have a 2-dimensional array. Each of the row vectors, in this case, is considered a quantity of interest. What I want to do is return all the rows that appear exactly once as one array, and all the rows that appear more than once as a second array.
For example, if the array was:
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
I would like to return two arrays:
nonsingles=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [5,1,6,0]]
singles= [[3,2,1,0], [4,4,1,0]]
It is important that the order stay preserved. The code I have written to do this is as follows:
import numpy as np

def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array
    singles = []
    nonsingles = []
    arrayhash = map(tuple, array)
    for x in arrayhash:
        if arrayhash.count(x) == 1:
            singles.append(x)
        if arrayhash.count(x) > 1:
            nonsingles.append(x)
    nonsingles = np.array(nonsingles)   # np.array, since the parameter shadows the name 'array'
    singles = np.array(singles)
    return {'singles': singles, 'nonsingles': nonsingles}
Now, I am happy to say that this works, but unhappy to say that it is extremely slow: a typical array I have is 30000 rows x 10 elements/row = 300000 elements. Can anyone give me some tips on how to speed this up? I apologize if this question is very simple; I am new to Python. Also, I am using NumPy/SciPy with Python 2.7, if that helps.
In Python 2.7 or above, you can use collections.Counter to count the number of occurrences:
import collections

def unique_items(iterable):
    tuples = list(map(tuple, iterable))   # materialize, so we can iterate twice
    counts = collections.Counter(tuples)
    unique = []
    non_unique = []
    for t in tuples:
        if counts[t] == 1:
            unique.append(t)
        else:
            non_unique.append(t)
    return unique, non_unique
I think your problem is that you are doing an in test on a list. This has O(n) performance.
It should be faster to build a dict and then use that to figure out what to do with each row.
EDIT: The code had an unnecessary enumerate() in it; I stripped it out.
from collections import defaultdict
def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array
    singles = []
    nonsingles = []
    d = defaultdict(int)
    t = [tuple(row) for row in array]
    for row in t:
        d[row] += 1
    for row in t:
        if d[row] == 1:
            singles.append(row)
        else:
            nonsingles.append(row)
    return {'singles': singles, 'nonsingles': nonsingles}
Here's a version that only returns unique rows:
from collections import defaultdict
def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array, each listed only once
    singles = []
    nonsingles = []
    d = defaultdict(int)
    already_seen = set()
    t = [tuple(row) for row in array]
    for row in t:
        d[row] += 1
    for row in t:
        if row in already_seen:
            continue
        if d[row] == 1:
            singles.append(row)
        else:
            nonsingles.append(row)
        already_seen.add(row)
    return {'singles': singles, 'nonsingles': nonsingles}
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
x = singles_nonsingles(a)
print("Array: " + str(a))
print(x)
The first function returns the singles/non-singles lists without repetitions; the second keeps repetitions:
def comp(multi):
    from collections import defaultdict
    res = defaultdict(int)
    for vect in multi:
        res[tuple(vect)] += 1
    singles = []
    no_singles = []
    for k in res:
        if res[k] > 1:
            no_singles.append(list(k))
        elif res[k] == 1:
            singles.append(list(k))
    return singles, no_singles
def count_w_repetitions(multi):
    from collections import defaultdict
    res = defaultdict(int)
    for vect in multi:
        res[tuple(vect)] += 1
    singles = []
    no_singles = []
    for k in res:
        if res[k] == 1:
            singles.append(list(k))
        else:
            for i in xrange(res[k]):
                no_singles.append(list(k))
    return singles, no_singles
from itertools import compress
from collections import Counter

# count how often each complete row occurs across the whole array
counts = Counter(map(tuple, a))
uniq = [counts[tuple(row)] == 1 for row in a]          # True where the row appears exactly once
singles = list(compress(a, uniq))
nonsingles = list(compress(a, (not u for u in uniq)))  # complement mask for the repeated rows
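For example, on the array from the question, this compress-based sketch should give:
a = [[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
# singles    -> [[3, 2, 1, 0], [4, 4, 1, 0]]
# nonsingles -> [[1, 1, 1, 0], [1, 1, 1, 0], [5, 1, 6, 0], [5, 1, 6, 0]]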