I'm trying to calculate the standard deviation from a bunch of numbers in a document.
Here's what I got so far:
with open("\\Users\\xxx\\python_courses\\1DV501\\assign3\\file_10000integers_B.txt", "r") as f:
total2 = 0
number_of_ints2 = 0
deviation = 0.0
variance = 0.0
for line in f:
for num in line.split(':'):
total2 += int(num)
number_of_ints2 += 1
average = total2/number_of_ints2
for line in f:
for num in line.split(":"):
devation += [(int(num) - average) **2
But I'm completely stuck. I don't know how to do it. Math is not my strong suit, so this is turning out to be quite difficult.
Also, the document mixes negative and positive numbers, if that makes any difference.
You can use a few available libraries. For example, if I had some data I got from somewhere:
>>> import random
>>> data = [random.randint(1,100) for _ in range(100)] # assume from your txt file
I could use statistics.stdev
>>> import statistics
>>> statistics.stdev(data)
28.453646514989956
or numpy.std
>>> import numpy as np
>>> np.std(data)
28.311020822287563
or scipy.stats.tstd
>>> import scipy.stats
>>> scipy.stats.tstd(data)
28.453646514989956
or if you want to roll your own:
import math

def stddev(data):
    mean = sum(data) / len(data)
    return math.sqrt((1/len(data)) * sum((i-mean)**2 for i in data))

>>> stddev(data)
28.311020822287563
Note that the slight difference in the computed values depends on whether you want the "sample" standard deviation or the "population" standard deviation.
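For instance, statistics also provides the population variant, and numpy's ddof parameter switches its result to the sample formula:
import statistics
import numpy as np

statistics.pstdev(data)    # population formula, same result as np.std(data)
np.std(data, ddof=1)       # sample formula, same result as statistics.stdev(data)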
You may use the stdev function from the statistics module; see the official documentation.
Put your numbers in a list, then apply the function:
from statistics import stdev
mylist = [1,2,5,10,100]
std = stdev(mylist)
The problem is that you are iterating over the file twice, and you didn't reset the reader to the beginning of the file before the second loop. You can use f.seek(0) to do this.
total2 = 0
number_of_ints2 = 0
deviation = 0.0
variance = 0.0

with open("numbers.txt", "r") as f:
    for line in f:
        for num in line.split(':'):
            total2 += int(num)
            number_of_ints2 += 1
    average = total2 / number_of_ints2

    f.seek(0)  # Move back to the beginning of the file.
    for line in f:
        for num in line.split(":"):
            deviation += (int(num) - average) ** 2
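From there, finishing the calculation is one more step; a minimal sketch (population standard deviation, using the deviation sum and count from the code above):
import math

variance = deviation / number_of_ints2   # mean of the squared deviations
standard_deviation = math.sqrt(variance)
print(standard_deviation)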
I'm trying to plot clusters for my data, which is stored in a .data file, using the density peak clustering algorithm with the code below, but the process gets killed because the file is 8 GB and my RAM is 32 GB. How can I solve this problem, please?
The core problem is loading the whole file with this method:
def density_and_distance(self, distance_file, dc = None):
    print("Begin")
    distance, num, max_dis, min_dis = load_data(distance_file)
    print("end")
    if dc == None:
        dc = auto_select_dc(distance, num, max_dis, min_dis)
    rho = local_density(distance, num, dc)
    delta, nearest_neighbor = min_distance(distance, num, max_dis, rho)
    self.distance = distance
    self.rho = rho
    self.delta = delta
    self.nearest_neighbor = nearest_neighbor
    self.num = num
    self.dc = dc
    return rho, delta
I see "Begin" printed, then the process gets killed after a few minutes.
The file contains lines like:
1 2 19.86
1 3 36.66
1 4 87.94
1 5 11.07
1 6 36.94
1 7 52.04
1 8 173.68
1 9 28.10
1 10 74.00
1 11 85.36
1 12 40.04
1 13 95.24
1 14 67.29
....
The method that reads the file is:
def load_data(distance_file):
    distance = {}
    min_dis, max_dis = sys.float_info.max, 0.0
    num = 0
    with open(distance_file, 'r', encoding = 'utf-8') as infile:
        for line in infile:
            content = line.strip().split(' ')
            assert(len(content) == 3)
            idx1, idx2, dis = int(content[0]), int(content[1]), float(content[2])
            num = max(num, idx1, idx2)
            min_dis = min(min_dis, dis)
            max_dis = max(max_dis, dis)
            distance[(idx1, idx2)] = dis
            distance[(idx2, idx1)] = dis
    for i in range(1, num + 1):
        distance[(i, i)] = 0.0
    return distance, num, max_dis, min_dis
which I tried to change to:
import dask.dataframe as dd

def load_data(distance_file):
    distance = {}
    min_dis, max_dis = sys.float_info.max, 0.0
    num = 0
    #with open(distance_file, 'r', encoding = 'utf-8') as infile:
    df_dd = dd.read_csv("ex3.csv")
    print("df_dd", df_dd.head())
    #for line in df_dd:
    #    content = df_dd.strip().split(' ')
    #    print(content)
    idx1, idx2, dis = df_dd.partitions[0], df_dd.partitions[1], df_dd.partitions[2]
    print("df_dd.partitions[0]", df_dd.partitions[0])
    num = max(num, idx1, idx2)
    min_dis = min(min_dis, dis)
    max_dis = max(max_dis, dis)
    distance[(idx1, idx2)] = dis
    distance[(idx2, idx1)] = dis
    for i in range(1, num + 1):
        distance[(i, i)] = 0.0
    return distance, num, max_dis, min_dis
You are using Python native integers and floats: these alone take tens of bytes for each actual number in your data (28 bytes for an integer).
If you simply use Numpy or Pandas for that, your memory consumption might be slashed by a factor of 4 or more, without further adjustments.
Your lines average about 10 bytes, so an 8 GB file should have fewer than 800 million records. If you use 16-bit integers and 32-bit floats, your data might fit in 10 GB of memory. It is still a tight call, as the default pandas behavior is to copy everything on changes to a column. There are other options:
Since your code depends on indexing the rows as you've done there, you could just offload your data to an SQLite DB and use SQLite indices instead of the dict you are using, as well as its MIN and MAX operators: this would keep memory usage down, and SQLite would do its job with minimal fuss.
Another option would be to use "dask" instead of Pandas: it will take care of offloading data that would not fit in memory to disk.
TL;DR: the way your problem is arranged, going to SQLite is probably the route that requires the fewest changes to what you already have.
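As an illustration of that SQLite route, here is a minimal sketch (the database, table, and file names are placeholders, not from your code); the (idx1, idx2) dict becomes an indexed table, and num/min_dis/max_dis come from SQL aggregates:
import sqlite3

conn = sqlite3.connect("distances.db")   # on-disk database, so RAM use stays small
conn.execute("CREATE TABLE IF NOT EXISTS distance (idx1 INTEGER, idx2 INTEGER, dis REAL)")
conn.execute("CREATE INDEX IF NOT EXISTS ix_pair ON distance (idx1, idx2)")

def rows(path):
    # Stream the file line by line; nothing is held in memory.
    with open(path, "r", encoding="utf-8") as infile:
        for line in infile:
            a, b, d = line.split()
            yield int(a), int(b), float(d)

conn.executemany("INSERT INTO distance VALUES (?, ?, ?)", rows("distance_file.txt"))
conn.commit()

num = max(conn.execute("SELECT MAX(idx1), MAX(idx2) FROM distance").fetchone())
min_dis, max_dis = conn.execute("SELECT MIN(dis), MAX(dis) FROM distance").fetchone()

def lookup(i, j):
    # Replacement for distance[(i, j)]; the symmetric pair is handled with OR.
    if i == j:
        return 0.0
    row = conn.execute(
        "SELECT dis FROM distance WHERE (idx1=? AND idx2=?) OR (idx1=? AND idx2=?)",
        (i, j, j, i)).fetchone()
    return row[0] if row else None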
I am writing a program which evaluates a function on the interval 0 to 9 with a step size of 0.005. This requires 1800 calculations, plus a way to find the maximum value of the function and the x argument that was used for it.
What would be the recommended way (and loops) to calculate the function 1800 times (9/0.005), find its maximum value, and output the argument value that produced that maximum?
My idea was that two lists should be generated, one for the range/interval (1800 items) and one for the calculated values (also 1800 items), and then the maximum would be found in the 'calculated' list and the related x argument looked up in the other list, using the list index or some other method.
from operator import itemgetter
import math

myfile = open("result.txt", "w")
data = []
step = 0.005
rng = 9
lim = rng/step
print(lim)
xs = [x * step for x in range(rng)]
lim_int = int(lim)
print(xs)

for i in range(lim_int):
    num = itemgetter(i)(xs)
    x = math.sin(num) * math.exp(-num/100)
    print(i, x)
    data.append(x)

for i in range(rng):
    text = str(i)
    text2 = str(data[i])
    print(text, text2)
    myfile.write(text + ' ' + text2 + '\n')

i = 1
while i < rng:
    i = i + 1
    num2 = itemgetter(i)(xs)
    v = math.sin(num2) * math.exp(-num2/100)
    if v == max(data):
        arg = num2
        break

print('largest function value', max(data))
print('function argument value used', arg)
myfile.close()
Numpy is the widely used performant package for this:
import numpy as np
x = np.arange(0, 9, 0.005)
f = np.sin(x)*np.exp(-x/100)
print("max is: ", np.max(f))
print("index of max is: ", np.argmax(f))
output:
max is: 0.98446367206362
index of max is: 312
If for some reason you want a native python solution (without using the list methods max and index), you can do something like this:
import math

step = 0.005
rng = 9
lim = int(rng/step)

x = [x_i*step for x_i in range(lim + 1)]
f = [math.exp(-x_i/100)*math.sin(x_i) for x_i in x]

max_ind = 0
f_max = f[max_ind]
for j, f_x in enumerate(f):
    if f_x > f_max:
        f_max = f_x
        max_ind = j
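The x argument that produced the maximum is then just the element of x at that index; for the loop version above, for example:
print("max is:", f_max)
print("x at max is:", x[max_ind])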
I'm trying to calculate the standard deviation of all the data that's in the "ClosePrices" column; see the pastebin https://pastebin.com/JtGr672m
We need to calculate one standard deviation over all 1029 floats.
This is my code:
ins1 = open("bijlage.txt", "r")
for line in ins1:
numbers = [(n) for n in number_strings]
i = i + 1
ClosePriceSD = []
ClosePrice = float(data[0][5].replace(',', '.'))
ClosePriceSD.append(ClosePrice)
def sd_calc(data):
    n = 1029
    if n <= 1:
        return 0.0
    mean, sd = avg_calc(data), 0.0
    # calculate stan. dev.
    for el in data:
        sd += (float(el) - mean)**2
    sd = math.sqrt(sd / float(n-1))
    return sd

def avg_calc(ls):
    n, mean = len(ls), 0.0
    if n <= 1:
        return ls[0]
    # calculate average
    for el in ls:
        mean = mean + float(el)
    mean = mean / float(n)
    return mean

print("Standard Deviation:")
print(sd_calc(ClosePriceSD))
print()
So what I'm trying to calculate is the standard deviation of all the floats in the "ClosePrices" column.
I have "ClosePrice = float(data[0][5].replace(',', '.'))", which should gather all the floats under ClosePrice, but it only takes the value from data[0][5]. I want one standard deviation calculated from all 1029 floats under ClosePrice.
I think your error is in the for loop at the beginning. You have for line in ins1, but you never use line inside the loop, and in the loop you also use number_strings and data, which are not defined beforehand.
Here is how you can extract the data from your txt file.
with open("bijlage.txt", "r") as ff:
ll = ff.readlines() #extract a list, each element is a line of the file
data = []
for line in ll[1:]: #excluding the first line wich is an header
d = line.split(';')[5] #split each line in a list using semicolon as a separator and keep the element with index 5
data.append(float(d.replace(',', '.'))) #substituting the comma with the dot in the string and convert it to a float
print data #data is a list with all the numbers you want
You should be able to calculate mean and standard deviation from here.
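From there, a minimal sketch of the last step using the statistics module (stdev is the sample standard deviation; pstdev would give the population version):
import statistics

mean = statistics.mean(data)
std = statistics.stdev(data)  # sample standard deviation (divides by n-1)
print("Mean:", mean)
print("Standard Deviation:", std)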
You didn't really specify what the issue/error is. Although this probably doesn't help if it is a school project, you could install scipy, which has a standard deviation function; in that case, just pass your array in as a parameter. Could you elaborate on what you're having trouble with? Is the current code giving an error?
Edit:
Looking at the data, you want the 6th element in each line (ClosePrice). If your function is working, and all you need is an array of the ClosePrices, this is what I would suggest.
data = []
ins1 = open("bijlage.txt", "r")
lines = [line.rstrip('\n') for line in ins1]

for line in lines[1:]:  # skip the header line
    fields = line.split(';')
    data.append(float(fields[5].replace(',', '.')))
import math

def sd_calc(data):
    n = 1029
    if n <= 1:
        return 0.0
    mean, sd = avg_calc(data), 0.0
    # calculate stan. dev.
    for el in data:
        sd += (float(el) - mean)**2
    sd = math.sqrt(sd / float(n-1))
    return sd

def avg_calc(ls):
    n, mean = len(ls), 0.0
    if n <= 1:
        return ls[0]
    # calculate average
    for el in ls:
        mean = mean + float(el)
    mean = mean / float(n)
    return mean

print("Standard Deviation:")
print(sd_calc(data))
print()
My program is meant to calculate the standard deviation of 5 values given by the user. There is an issue with my code when getting the input in a for loop. Why is that?
givenValues = []

def average(values):
    for x in range(0, 6):
        total = total + values[x]
        if(x==5):
            average = total/x
            return average

def sqDiff(values):
    totalSqDiff = 0
    sqDiff = []
    av = average(values)
    for x in range(0,6):
        sqDiff[x] = (values[x] - av)**2
        totalSqDiff = totalSqDiff + sqDiff[x]
    avSqDiff = totalSqDiff / 5
    SqDiffSquared = avSqDiff**2
    return SqDiffSquared

for counter in range(0,6):
    givenValues[counter] = float(input("Please enter a value: "))
    counter = counter + 1

sqDiffSq = sqDiff(givenValues)
print("The standard deviation for the given values is: " + sqDiffSq)
There are several errors in your code, which you can easily find by reading the error messages your code produces:
In the function average, insert the line total = 0 before the loop; you are using the variable before assigning to it.
List appending: do not use, for example,
sqDiff[x] = (values[x] - av)**2
You can do this with dicts, but not with lists. Since Python cannot be sure that the list indices will be assigned continuously, use sqDiff.append(...) instead.
Do not concatenate strings with floats. I recommend reading PEP 498
(https://www.python.org/dev/peps/pep-0498/), which gives you an idea of how strings could/should be formatted in Python.
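Putting those fixes together, a minimal sketch of the whole program could look like this (it reads five values, appends them to the initially empty list, and computes the population standard deviation):
import math

givenValues = []
for _ in range(5):
    givenValues.append(float(input("Please enter a value: ")))

mean = sum(givenValues) / len(givenValues)
variance = sum((v - mean) ** 2 for v in givenValues) / len(givenValues)
stdDev = math.sqrt(variance)  # population standard deviation
print(f"The standard deviation for the given values is: {stdDev}")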
I have a huge list (45M+ data points) with numerical values:
[78,0,5,150,9000,5,......,25,9,78422...]
I can easily get the maximum and minimum values, the number of these values, and the sum of them:
file_handle = open('huge_data_file.txt', 'r')
sum_values = 0
min_value = None
max_value = None

for i, line in enumerate(file_handle):
    value = int(line[:-1])
    if min_value == None or value < min_value:
        min_value = value
    if max_value == None or value > max_value:
        max_value = value
    sum_values += value

average_value = float(sum_values)/i
However, this is not what I need. I need a list of 10 numbers, where the number of data points between each two consecutive points is equal, for example
median points [0,30,120,325,912,1570,2522,5002,7025,78422]
so that the number of data points between 0 and 30, or between 30 and 120, is almost 4.5 million.
How can we do this?
=============================
EDIT:
I am well aware that we will need to sort the data. The problem is that I cannot fit all this data in one variable in memory, but I need to read it sequentially from a generator (file_handle)
If you are happy with an approximation, here is a great (and fairly easy to implement) algorithm for computing quantiles from stream data: "Space-Efficient Online Computation of Quantile Summaries" by Greenwald and Khanna.
The silly numpy approach:
import numpy as np
# example data (produced by numpy but converted to a simple list)
datalist = list(np.random.randint(0, 10000000, 45000000))
# converted back to numpy array (start here with your data)
arr = np.array(datalist)
np.percentile(arr, 10), np.percentile(arr, 20), np.percentile(arr, 30)
# ref:
# http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
You can also hack something together where you just do something like:
arr.sort()
# And then select the 10%, 20%, etc. value; add some check for an equal amount of
# numbers within a bin and then calculate the average, exercise for the reader :-)
The thing is that calling this function several times will slow it down, so really, just sort the array and then select the elements yourself.
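For example, a minimal version of that sort-and-select approach (plain cut points, without averaging ties):
arr.sort()
n = len(arr)
boundaries = [arr[n * k // 10] for k in range(1, 10)]  # 10%, 20%, ..., 90% cut points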
As you said in the comments that you want a solution that can scale to larger datasets than can be stored in RAM, feed the data into an SQLite3 database. Even if your data set is 10 GB and you only have 8 GB of RAM, an SQLite3 database should still be able to sort the data and give it back to you in order.
The SQLite3 database gives you a generator over your sorted data.
You might also want to look into going beyond Python and using some other database solution.
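As a rough sketch of that idea (the database, table, and file names here are placeholders): insert the values, index them, and read each decile boundary back with ORDER BY and OFFSET:
import sqlite3

conn = sqlite3.connect("numbers.db")  # on-disk database, so little RAM is needed
conn.execute("CREATE TABLE IF NOT EXISTS nums (value INTEGER)")

with open("huge_data_file.txt", "r") as fh:
    conn.executemany("INSERT INTO nums VALUES (?)", ((int(line),) for line in fh))
conn.commit()
conn.execute("CREATE INDEX IF NOT EXISTS ix_value ON nums (value)")

total = conn.execute("SELECT COUNT(*) FROM nums").fetchone()[0]
boundaries = []
for k in range(1, 10):  # the 10%, 20%, ..., 90% points
    offset = k * total // 10
    row = conn.execute("SELECT value FROM nums ORDER BY value LIMIT 1 OFFSET ?",
                       (offset,)).fetchone()
    boundaries.append(row[0])
print(boundaries)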
Here's a pure-python implementation of the partitioned-on-disk sort. It's slow, ugly code, but it works and hopefully each stage is relatively clear (the merge stage is really ugly!).
#!/usr/bin/env python
import os

def get_next_int_from_file(f):
    l = f.readline()
    if not l:
        return None
    return int(l.strip())

MAX_SAMPLES_PER_PARTITION = 1000000
PARTITION_FILENAME = "_{}.txt"

# Partition data set
part_id = 0
eof = False
with open("data.txt", "r") as fin:
    while not eof:
        print("Creating partition {}".format(part_id))
        with open(PARTITION_FILENAME.format(part_id), "w") as fout:
            for _ in range(MAX_SAMPLES_PER_PARTITION):
                line = fin.readline()
                if not line:
                    eof = True
                    break
                fout.write(line)
        part_id += 1
num_partitions = part_id

# Sort each partition
for part_id in range(num_partitions):
    print("Reading unsorted partition {}".format(part_id))
    with open(PARTITION_FILENAME.format(part_id), "r") as fin:
        samples = [int(line.strip()) for line in fin.readlines()]
    print("Disk-Deleting unsorted {}".format(part_id))
    os.remove(PARTITION_FILENAME.format(part_id))
    print("In-memory sorting partition {}".format(part_id))
    samples.sort()
    print("Writing sorted partition {}".format(part_id))
    with open(PARTITION_FILENAME.format(part_id), "w") as fout:
        fout.writelines(["{}\n".format(sample) for sample in samples])

# Merge-sort the partitions
# NB This is a very inefficient implementation!
print("Merging sorted partitions")
part_files = []
part_next_int = []
num_lines_out = 0

# Setup data structures for the merge
for part_id in range(num_partitions):
    fin = open(PARTITION_FILENAME.format(part_id), "r")
    next_int = get_next_int_from_file(fin)
    if next_int is None:
        continue
    part_files.append(fin)
    part_next_int.append(next_int)

with open("data_sorted.txt", "w") as fout:
    while part_files:
        # Find the smallest number across all files
        min_number = None
        min_idx = None
        for idx in range(len(part_files)):
            if min_number is None or part_next_int[idx] < min_number:
                min_number = part_next_int[idx]
                min_idx = idx
        # Now add that number, and move the relevant file along
        fout.write("{}\n".format(min_number))
        num_lines_out += 1
        if num_lines_out % MAX_SAMPLES_PER_PARTITION == 0:
            print("Merged samples: {}".format(num_lines_out))
        next_int = get_next_int_from_file(part_files[min_idx])
        if next_int is None:
            # Remove this partition, it's now finished
            del part_files[min_idx:min_idx + 1]
            del part_next_int[min_idx:min_idx + 1]
        else:
            part_next_int[min_idx] = next_int

# Cleanup partition files
for part_id in range(num_partitions):
    os.remove(PARTITION_FILENAME.format(part_id))
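This stops at producing data_sorted.txt; as a small follow-up sketch (using num_lines_out from above), the decile boundaries the question asks about can be read off in one more sequential pass:
# Read the decile boundaries back from the sorted file.
bin_size = num_lines_out // 10
boundaries = []
with open("data_sorted.txt", "r") as fin:
    for i, line in enumerate(fin):
        if i % bin_size == 0:
            boundaries.append(int(line))
print(boundaries)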
My code is a proposal for finding the result without needing much space. In testing, it found a quantile value in 7 minutes 51 seconds for a dataset of size 45,000,000.
import random
from bisect import bisect_left

class data():
    def __init__(self, values):
        random.shuffle(values)
        self.values = values

    def __iter__(self):
        for i in self.values:
            yield i

    def __len__(self):
        return len(self.values)

    def sortedValue(self, percentile):
        val = list(self)
        val.sort()
        num = int(len(self)*percentile)
        return val[num]

def init():
    numbers = data([x for x in range(1,1000000)])
    print(seekPercentile(numbers, 0.1))
    print(numbers.sortedValue(0.1))

def seekPercentile(numbers, percentile):
    lower, upper = minmax(numbers)
    maximum = upper
    approx = _approxPercentile(numbers, lower, upper, percentile)
    return neighbor(approx, numbers, maximum)

def minmax(list):
    minimum = float("inf")
    maximum = float("-inf")
    for num in list:
        if num>maximum:
            maximum = num
        if num<minimum:
            minimum = num
    return minimum, maximum

def neighbor(approx, numbers, maximum):
    dif = maximum
    for num in numbers:
        if abs(approx-num)<dif:
            result = num
            dif = abs(approx-num)
    return result

def _approxPercentile(numbers, lower, upper, percentile):
    middles = []
    less = []
    magicNumber = 10000
    step = (upper - lower)/magicNumber
    less = []
    for i in range(1, magicNumber-1):
        middles.append(lower + i * step)
        less.append(0)
    for num in numbers:
        index = bisect_left(middles,num)
        if index<len(less):
            less[index]+= 1
    summing = 0
    for index, testVal in enumerate(middles):
        summing += less[index]
        if summing/len(numbers) < percentile:
            print(" Change lower from "+str(lower)+" to "+ str(testVal))
            lower = testVal
        if summing/len(numbers) > percentile:
            print(" Change upper from "+str(upper)+" to "+ str(testVal))
            upper = testVal
            break
    precision = 0.01
    if (lower+precision)>upper:
        return lower
    else:
        return _approxPercentile(numbers, lower, upper, percentile)

init()
I edited my code a bit, and I now think this approach works at least decently, even if it is not optimal.