I am trying to do the following: identify whether there is an 'NA' value in a nested list and, if so, replace it with the average of the other elements of that sub-list. The elements of the lists should be floats. For example:
[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
should return
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
The code below, albeit long and redundant, works:
def convert_data(data):
    first = []
    second = []
    third = []
    fourth = []
    count = 0
    for i in data:
        for y in i:
            if 'NA' not in i:
                y = float(y)
                first.append(y)
            elif 'NA' in i:
                a = i.index('NA')
                second.append(y)
    second[a] = 0
    for q in second:
        q = float(q)
        third.append(q)
        count += q
    length = len(third)
    count = count/(length-1)
    third[a] = count
    fourth.extend([first, third])
    return fourth
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
for example:
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
returns the desired output:
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
but if the 'NA' is in the first list e.g.
data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
then it doesn't work. Can someone please explain how to fix this?
data_var = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
def replace_na_with_mean(list_entry):
    for i in range(len(list_entry)):
        index_list = []
        m = 0
        while 'NA' in list_entry[i]:
            index_list.append(list_entry[i].index('NA') + m)
            del list_entry[i][list_entry[i].index('NA')]
        if list_entry[i]:
            for n in range(len(list_entry[i])):
                list_entry[i][n] = float(list_entry[i][n])
        if index_list:
            if list_entry[i]:
                avg = sum(list_entry[i]) / len(list_entry[i])
            else:
                avg = 0
            for l in index_list:
                list_entry[i].insert(l, avg)
    return list_entry
print(replace_na_with_mean(data_var))
I'd suggest using pandas, since these types of operations are exactly what pandas was developed for. You can achieve what you want in just a few lines of code:
import pandas as pd
import numpy as np

data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
df = pd.DataFrame(data).T.replace("NA", np.nan).astype(float)
res = df.fillna(df.mean()).T.values.tolist()
which returns the wanted output:
[[1.2, 0.7, 0.2], [44.0, 67.0, 90.0]]
Btw your code works for me just fine in this simple case:
convert_data(data)
> [[44.0, 67.0, 90.0], [1.2, 0.7, 0.2]]
It will definitely start failing or giving faulty results in more complicated cases. For example, if you have more than one "NA" value in the nested list, you will get a ValueError exception (you will be trying to convert the string 'NA' into a float).
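For instance, with the convert_data function from the question (a hedged illustration of that failure, not from the original post):

data = [["1.2", "NA", "NA"], ["44.0", "67.0", "90.0"]]
convert_data(data)   # ValueError: could not convert string to float: 'NA'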
This should do the trick, using numpy:
import numpy as np
x=[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
#convert to float
x = np.char.replace(np.array(x), "NA", "nan").astype(float)
#replace nan-s with mean
mask=x.astype(str)=="nan"
x[mask]=np.nanmean(x, axis=1)[mask.any(axis=1)]
Output:
[[ 1.2  3.1  0.2]
 [44.  67.  90. ]]
One reason why your code ended up a little overcomplicated is that you tried to start by solving the problem of a "nested list." But really, all you need is a function that processes a list of numeric strings with some "NA" values, and then you can just apply that function to every item in the list.
def float_or_average(list_of_num_strings):
    # First, convert every item that you can to a number. You need to do this
    # before you can handle even ONE "NA" value, because the "NA" values need
    # to be replaced with the average of all the numbers in the collection.
    # So for now, convert ["1.2", "NA", "2.0"] to [1.2, "NA", 2.0]
    parsed = []
    # While we're at it, let's record the sum of the floats and their count,
    # so that we can compute that average.
    numeric_sum = 0.0
    numeric_count = 0
    for item in list_of_num_strings:
        if item == "NA":
            parsed.append(item)
        else:
            floating_point_value = float(item)
            parsed.append(floating_point_value)
            numeric_sum += floating_point_value
            numeric_count += 1
    # Now we can calculate the average:
    average = numeric_sum / numeric_count
    # And replace the "NA" values with it.
    for i, item in enumerate(parsed):
        if item == "NA":
            parsed[i] = average
    return parsed
    # Or, with a list comprehension (replacing the previous four lines of
    # code):
    return [number if number != "NA" else average for number in parsed]
# Using this function on a nested list is as easy as
example_data = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
parsed_nested_list = []
for sublist in example_data:
    parsed_nested_list.append(float_or_average(sublist))
# Or, using a list comprehension (replacing the previous three lines of code):
parsed_nested_list = [float_or_average(sublist) for sublist in example_data]
def convert_data(data):
    for lst in data:
        sum = 0
        index_na = list()
        for elem in range(len(lst)):
            if lst[elem] != 'NA':
                sum += float(lst[elem])
                lst[elem] = float(lst[elem])
            else:
                index_na.append(elem)
        if len(index_na) > 0:
            len_values = sum / (len(lst) - len(index_na))
            for i in index_na:
                lst[i] = float("{0:.2f}".format(len_values))
    return data
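For example (a quick check of my own, assuming the function above):

data = [["1.2", "NA", "0.2"], ["44.0", "67.00", "90.0"]]
print(convert_data(data))   # [[1.2, 0.7, 0.2], [44.0, 67.0, 90.0]]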
Related
I am trying to write a program that iterates through data values from a CSV file and adds them to a dictionary, keeping a running total of how many times each value appears in my list of values. I am able to do this, but I also need to add a range (not the range function): for example, if the current value is within + or - 0.50 of another, it should take the average of the two and add one to that running total.
data = {}
file = open(fname)

# Create value dictionary, add running count to repeated values
for line in file:
    rows = line.split(",")
    for i in range(4):
        price = rows[i]
        price = float(price)
        newnum = price
        data[price] = data.get(price, 0) + 1

# Get top 10 most common values
top_dogs = {}
for i in range(10):
    key = max(data, key=data.get)
    value = data.pop(key)
    top_dogs[key] = value
print(top_dogs)
In general, dicts don't have a capability for matching ranges, so you either need to collapse the range to a single value or use another data structure such as a sorted list.
As an example of the first technique, the round() function will suffice for finding values within "+ or - .50" of one another:
data = [10.1, 11.2, 10.5, 12.5, 10.2, 12.6, 11.4, 11.7, 11.8]

d = {}
for x in data:
    k = round(x)
    d[k] = d.get(k, 0) + 1
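As a quick sanity check of my own (not from the original answer), the sample data above gives:

print(d)   # {10: 3, 11: 2, 12: 3, 13: 1} - Python 3 rounds 10.5 and 12.5 down (half-to-even)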
For the second technique, you can maintain a sorted list with the bisect module which is good at searching ranges and maintaining search order.
from statistics import mean
from bisect import bisect_left, bisect_right, insort

data = [10.1, 11.2, 10.5, 12.5, 10.2, 12.6, 11.4, 11.7, 11.8]

d = {}
sorted_list = []
for x in data:
    lo = bisect_left(sorted_list, x - 0.5)
    hi = bisect_right(sorted_list, x + 0.5)
    if lo == hi:
        new_x = x
        new_count = 1
    else:
        old_x = sorted_list.pop(lo)
        new_x = mean([old_x, x])
        new_count = d.pop(old_x) + 1
    d[new_x] = new_count
    insort(sorted_list, new_x)
Note 1: This code can be tweaked further so that if multiple values are in the lo:hi range, the closest one to x can be updated. For example, if the sorted_list contained [10.1, 10.8], both values are within 0.50 of 10.5, but 10.8 should be selected for update because it is closer to 10.5.
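Here is a hedged sketch of that tweak, as my own variation on the loop above rather than the original code:

from statistics import mean
from bisect import bisect_left, bisect_right, insort

data = [10.1, 11.2, 10.5, 12.5, 10.2, 12.6, 11.4, 11.7, 11.8]

d = {}
sorted_list = []
for x in data:
    lo = bisect_left(sorted_list, x - 0.5)
    hi = bisect_right(sorted_list, x + 0.5)
    if lo == hi:
        new_x = x
        new_count = 1
    else:
        # Pick the existing entry closest to x, not simply the leftmost one.
        old_x = min(sorted_list[lo:hi], key=lambda v: abs(v - x))
        sorted_list.remove(old_x)
        new_x = mean([old_x, x])
        new_count = d.pop(old_x) + 1
    d[new_x] = new_count
    insort(sorted_list, new_x)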
Note 2: The request to average the inputs likely isn't the right thing to do because it weights the most recently seen input more than the earlier inputs. A better result can be had by keeping a list of all nearby inputs and then averaging them at the end.
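And a rough sketch of that idea, assuming each input is attached to the first representative found within 0.5 of it (again my own illustration, not the original code):

from statistics import mean
from bisect import bisect_left, bisect_right, insort

data = [10.1, 11.2, 10.5, 12.5, 10.2, 12.6, 11.4, 11.7, 11.8]

groups = {}       # first value seen in a group -> all inputs assigned to it
sorted_list = []  # sorted group representatives, for the range search
for x in data:
    lo = bisect_left(sorted_list, x - 0.5)
    hi = bisect_right(sorted_list, x + 0.5)
    if lo == hi:
        insort(sorted_list, x)
        groups[x] = [x]
    else:
        groups[sorted_list[lo]].append(x)

# Average each group only at the end, so all members are weighted equally.
d = {mean(members): len(members) for members in groups.values()}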
Note 3: Rather than the algorithm as requested, it may be better to sort all the inputs, then scan for blocks where all values lie in a specified interval:
from statistics import mean
data = [10.1, 11.2, 10.5, 12.5, 10.2, 12.6, 11.4, 11.7, 11.8]
data.sort()
d = {}
equivalents = []
for x in data:
    if not equivalents or x < equivalents[0] + 1.0:
        equivalents.append(x)
    else:
        d[mean(equivalents)] = len(equivalents)
        equivalents.clear()
        equivalents.append(x)  # start the next block with the current value
if equivalents:
    d[mean(equivalents)] = len(equivalents)
    equivalents.clear()
** I modified the entire question **
I have the example lists specified below, and I want to find out whether two values come from the same list, and which list each of the values comes from.
list1 = ['a','b','c','d','e']
list2 = ['f','g','h','i','j']
c = 'b'
d = 'e'
I used a for loop to check whether the values exist in the lists, but I'm not sure how to find out which list a value actually comes from.
for x, y in zip(list1, list2):
    if c and d in x or y:
        print(True)
Please advise if there is any workaround.
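One possible approach (a hedged sketch of mine, not from the original thread) is to keep the lists in a named mapping and test membership directly:

list1 = ['a', 'b', 'c', 'd', 'e']
list2 = ['f', 'g', 'h', 'i', 'j']
c = 'b'
d = 'e'

named_lists = {"list1": list1, "list2": list2}
# For each value, collect the names of every list that contains it.
sources = {value: [name for name, lst in named_lists.items() if value in lst]
           for value in (c, d)}
print(sources)                   # {'b': ['list1'], 'e': ['list1']}
print(sources[c] == sources[d])  # True -> both values come from the same list(s)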
First you might want to inspect the distribution of values and sizes to see where you can improve the result with the least effort, like this:
df_inspect = df.copy()
df_inspect["size.value"] = df_inspect["size.value"].map(
    lambda x: ''.join(y.upper() for y in x if y.isalpha() if y != ' '))
df_inspect = df_inspect.groupby(["size.value"]).size().sort_values(ascending=False)
Then create a solution for the most frequently occurring size category, here "Wide":
long = "adasda, 9.5 W US"
short = "9.5 Wide"
def get_intersection(s1, s2):
    res = ''
    l_s1 = len(s1)
    for i in range(l_s1):
        for j in range(i + 1, l_s1):
            t = s1[i:j]
            if t in s2 and len(t) > len(res):
                res = t
    return res
print(len(get_intersection(long, short)) / len(short) >= 0.6)
Then apply the solution to the dataframe
df["defective_attributes"] = df.apply(lambda x: len(get_intersection(x["item_name.value"], x["size.value"])) / len(x["size.value"]) >= 0.6)
Basically, get_intersection searches for the longest common substring of the item name and the size. It then takes the length of that intersection and says the row is not defective if at least 60% of the size value is also present in the item name.
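As a quick hedged check of that explanation, the longest common substring of the example strings above is "9.5 W", i.e. 5 of the 8 characters of short:

print(repr(get_intersection(long, short)))               # '9.5 W'
print(len(get_intersection(long, short)) / len(short))    # 0.625, which clears the 0.6 threshold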
I want to write my item values back onto the existing f['ECPM_medio'] column.
I made some modifications to the item values so that each number becomes 0.8 or 0.9 times its original value. The problem is that when I try to add these new numbers to the existing column... I paste the same number onto all rows!
import pandas as pd

jf = pd.read_csv("Cliente_x_Pais_Sitio.csv", header=0, sep=",")

del jf['Fill_rate']
del jf['Importe_a_pagar_a_medio']

a = jf.sort_values(by=["Cliente", "Auth_domain", "Sitio", 'Country'])
f = a.groupby(["Cliente", "Auth_domain", "Sitio", "Country"], as_index=False)['ECPM_medio'].min()

del a['Fecha']
del a['Subastas']
del a['Impresiones_exchange']

f.to_csv('Recom_Sitios.csv', index=False)

for item in f['ECPM_medio']:
    item = float(item)
    if item <= 0.5:
        item = item * 0.8
    else:
        item = item * 0.9
    item = float("{0:.2f}".format(item))
    item

for item in item:
    f['ECPM_medio'] = item

f.to_csv('Recom_Sitios22.csv', index=False)
It seems to me that you could also do something like this:
import numpy as np

f.loc[:, 'ECPM_medio'] = (f['ECPM_medio'] *
                          np.where(f['ECPM_medio'] <= 0.5, .8, .9)).round(2)
np.where(f['ECPM_medio'] <= 0.5, .8, .9) returns an array the length of your ECPM_medio column with values .8 or .9, depending on the same-indexed value in f['ECPM_medio']. You can then multiply your DataFrame column by this array, and wrap the whole expression in parentheses so that you can take the resulting Series (i.e. your transformed f['ECPM_medio'] column), and tack on .round(2) to round the column's values to two places.
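To make that concrete, here is a tiny hedged demonstration with made-up values (demo is just a stand-in for your f):

import numpy as np
import pandas as pd

demo = pd.DataFrame({'ECPM_medio': [0.3, 0.6, 0.5, 1.2]})    # made-up values
multipliers = np.where(demo['ECPM_medio'] <= 0.5, .8, .9)
print(multipliers)                                           # [0.8 0.9 0.8 0.9]
print((demo['ECPM_medio'] * multipliers).round(2).tolist())  # [0.24, 0.54, 0.4, 1.08]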
You should create a function and then apply it with lambda.
Example:
def myfunc(item):
    item = float(item)
    if item <= 0.5:
        item = item * 0.8
    else:
        item = item * 0.9
    item = float("{0:.2f}".format(item))
    return item
f['ECPM_medio'] = f['ECPM_medio'].apply(lambda x: myfunc(x))
You can do this using Pandas vectorized operations,
df['ECPM_medio'] = np.where(df['ECPM_medio'] <= 0.5, df['ECPM_medio'] * 0.8, df['ECPM_medio']* 0.9)
I have used the module intervals (http://pyinterval.readthedocs.io/en/latest/index.html)
And created an interval from a set of start, end tuples:
intervals = interval.interval([1,8], [7,10], [15,20])
Which results in interval([1.0, 10.0], [15.0, 20.0]), as [1,8] and [7,10] overlap.
But this module interprets the values of the pairs as real numbers, so two contiguous integer intervals will not be joined together.
Example:
intervals = interval.interval([1,8], [9,10], [11,20])
results in: interval([1.0, 8.0], [9.0, 10.0], [11.0, 20.0])
My question is: how can I join these intervals as integers and not as real numbers, so that in the last example the result would be interval([1.0, 20.0])?
The pyinterval module is intended for real numbers, not for integers. If you want to work with integer intervals, you can create your own integer interval class, or you can write a small helper to join integer intervals using the interval module:
def join_int_intervals(int1, int2):
    if int(int1[-1][-1]) + 1 >= int(int2[-1][0]):
        return interval.interval([int1[-1][0], int2[-1][-1]])
    else:
        return interval.interval()
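A hedged usage example, assuming pyinterval is installed and imported as in the question (import interval):

import interval

a = interval.interval([1, 8])
b = interval.interval([9, 10])
print(join_int_intervals(a, b))   # interval([1.0, 10.0])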
I believe you can use pyinterval for integer intervals too by adding interval([-0.5, 0.5]). With your example you get:
In[40]: interval([1,8], [9,10], [11,20]) + interval([-0.5, 0.5])
Out[40]: interval([0.5, 20.5])
This takes a list of tuples like l = [(25,24), (17,18), (5,9), (24,16), (10,13), (15,19), (22,25)]
# Idea by Ben Voigt in https://stackoverflow.com/questions/32869247/a-container-for-integer-intervals-such-as-rangeset-for-c
def sort_condense(ivs):
    if len(ivs) == 0:
        return []
    if len(ivs) == 1:
        if ivs[0][0] > ivs[0][1]:
            return [(ivs[0][1], ivs[0][0])]
        else:
            return ivs
    eps = []
    for iv in ivs:
        ivl = min(iv)
        ivr = max(iv)
        eps.append((ivl, False))
        eps.append((ivr, True))
    eps.sort()
    ret = []
    level = 0
    i = 0
    while i < len(eps) - 1:
        if not eps[i][1]:
            level = level + 1
            if level == 1:
                left = eps[i][0]
        else:
            if level == 1:
                if not eps[i + 1][1] and eps[i + 1][0] == eps[i][0] + 1:
                    i = i + 2
                    continue
                right = eps[i][0]
                ret.append((left, right))
            level = level - 1
        i = i + 1
    ret.append((left, eps[len(eps) - 1][0]))
    return ret
In [1]: sort_condense(l)
Out[1]: [(5, 13), (15, 25)]
The idea is outlined in Ben Voigt's answer to A container for integer intervals, such as RangeSet, for C++
Python is not my main language, sorry.
I came up with the following program:
ls = [[1, 8], [7, 10], [15, 20]]
ls2 = []

prevList = ls[0]
for lists in ls[1:]:
    if lists[0] <= prevList[1] + 1:
        prevList = [prevList[0], lists[1]]
    else:
        ls2.append(prevList)
        prevList = lists
ls2.append(prevList)

print(ls2)  # prints [[1, 10], [15, 20]]
It iterates through all the lists and checks whether the first element of each list is less than or equal to the previous interval's end + 1. If so, it merges the two.
As an example my list is:
[25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866,
19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154,
13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152,
6.983734, 6.048035, 5.505096, 4.65799]
and I'm looking for the index of the value closest to 11.5. I've tried other methods such as binary search and bisect_left but they don't work.
I cannot sort this array, because the index of the value will be used on a similar array to fetch the value at that index.
Try the following:
min(range(len(a)), key=lambda i: abs(a[i]-11.5))
For example:
>>> a = [25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866, 19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154, 13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152, 6.983734, 6.048035, 5.505096, 4.65799]
>>> min(range(len(a)), key=lambda i: abs(a[i]-11.5))
16
Or to get the index and the value:
>>> min(enumerate(a), key=lambda x: abs(x[1]-11.5))
(16, 11.33447)
import numpy as np
a = [25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866, 19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154, 13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152, 6.983734, 6.048035, 5.505096, 4.65799]
index = np.argmin(np.abs(np.array(a)-11.5))
a[index] # here is your result
In case a is already an array, the corresponding conversion can be omitted.
How about: you zip the two lists, then sort the result?
If you can't sort the array, then there is no quick way to find the closest item - you have to iterate over all entries.
There is a workaround but it's quite a bit of work: Write a sort algorithm which sorts the array and (at the same time) updates a second array which tells you where this entry was before the array was sorted.
That way, you can use binary search to look up the index of the closest entry and then use this index to look up the original index using the "index array".
[EDIT] Using zip(), this is pretty simple to achieve:
array_to_sort = list(zip(original_array, range(len(original_array))))
array_to_sort.sort(key=lambda i: i[0])
Now you can binary search for the value (using item[0]). item[1] will give you the original index.
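A hedged sketch of that lookup (the small array and names here are just for illustration, not from the original answer):

from bisect import bisect_left

original_array = [25.75443, 12.18347, 11.33447, 10.32184]
array_to_sort = sorted(zip(original_array, range(len(original_array))))

target = 11.5
pos = bisect_left(array_to_sort, (target,))
# Compare the neighbours around the insertion point to find the closest pair.
candidates = array_to_sort[max(pos - 1, 0):pos + 1]
value, original_index = min(candidates, key=lambda item: abs(item[0] - target))
print(original_index, value)   # 2 11.33447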
Going through all the items is only linear. If you were to sort the array first, that would be worse.
I don't see a problem with keeping an additional deltax (the minimum difference so far) and idx (the index of that element) and just looping once through the list.
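A minimal sketch of that single pass, with illustrative data (not the asker's full list):

a = [25.75443, 12.18347, 11.33447, 10.32184]
target = 11.5

idx = 0
deltax = abs(a[0] - target)   # smallest difference seen so far
for i, value in enumerate(a):
    if abs(value - target) < deltax:
        deltax = abs(value - target)
        idx = i
print(idx, a[idx])   # 2 11.33447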
Keep in mind that if space isn't important you can sort any list without moving the contents by creating a secondary list of the sorted indices.
Also bear in mind that if you are doing this lookup just once, you will simply have to traverse every element in the list, O(n). (If you do it multiple times, then you would probably want to sort the list for increased efficiency later.)
If you are searching a long list many times, then min scales very badly (O(n^2) if you append some of your searches to the search list, I think).
Bisect is your friend. Here's my solution. It scales O(n*log(n)):
import bisect

class Closest:
    """Assumes *no* redundant entries - all inputs must be unique"""

    def __init__(self, numlist=None, firstdistance=0):
        if numlist is None:
            numlist = []
        self.numindexes = dict((val, n) for n, val in enumerate(numlist))
        self.nums = sorted(self.numindexes)
        self.firstdistance = firstdistance

    def append(self, num):
        if num in self.numindexes:
            raise ValueError("Cannot append '%s' it is already used" % str(num))
        self.numindexes[num] = len(self.nums)
        bisect.insort(self.nums, num)

    def rank(self, target):
        rank = bisect.bisect(self.nums, target)
        if rank == 0:
            pass
        elif len(self.nums) == rank:
            rank -= 1
        else:
            dist1 = target - self.nums[rank - 1]
            dist2 = self.nums[rank] - target
            if dist1 < dist2:
                rank -= 1
        return rank

    def closest(self, target):
        try:
            return self.numindexes[self.nums[self.rank(target)]]
        except IndexError:
            return 0

    def distance(self, target):
        rank = self.rank(target)
        try:
            dist = abs(self.nums[rank] - target)
        except IndexError:
            dist = self.firstdistance
        return dist
Use it like this:
a = [25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866,
19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154,
13.09409, 12.18347, 1.33447, 10.32184, 9.544922, 8.813385, 8.181152,
6.983734, 6.048035, 5.505096, 4.65799]
targets = [1.0, 100.0, 15.0, 15.6, 8.0]
cl = Closest(a)
for x in targets:
    rank = cl.rank(x)
    print("Closest to %5.1f : rank=%2i num=%8.5f index=%2i " % (x, rank,
          cl.nums[rank], cl.closest(x)))
Will output:
Closest to 1.0 : rank= 0 num= 1.33447 index=16
Closest to 100.0 : rank=25 num=26.78030 index= 1
Closest to 15.0 : rank=12 num=14.79059 index=12
Closest to 15.6 : rank=13 num=15.71255 index=11
Closest to 8.0 : rank= 5 num= 8.18115 index=20
And:
cl.append(99.9)
x = 100.0
rank = cl.rank(x)
print("Closest to %5.1f : rank=%2i num=%8.5f index=%2i " % (x, rank,
cl.nums[rank], cl.closest(x)))
Output:
Closest to 100.0 : rank=25 num=99.90000 index=25