How do I create a function to find average above median? - python

I want to find the average of all scores above and below the median (not including the median), but I have no idea how to go about doing this.
import collections
def main():
    """Read ``state_meet.txt`` and print the median all-around score.

    Every line of the file is split on commas into three fields (school,
    participant name, points earned); the third field is converted to
    ``float``.  The parsed records are collected in ``data`` and the scores
    in ``values``.
    """
    names = ["gymnastics_school", "participant_name", "all_around_points_earned"]
    Data = collections.namedtuple("Data", names)
    data = []
    values = []
    with open('state_meet.txt', 'r') as f:
        for line in f:
            items = line.strip().split(',')
            items[2] = float(items[2])
            data.append(Data(*items))
            values.append(items[2])
    print("summary of data:")
    sorted_data = sorted(values)
    middle = len(sorted_data) // 2
    if len(sorted_data) % 2 == 0:
        # True division: `//` would floor the midpoint of two float scores
        # (e.g. 9.0 and 9.5 would give 9.0 instead of 9.25).
        median_val = (sorted_data[middle - 1] + sorted_data[middle]) / 2
    else:
        median_val = sorted_data[middle]
    print(" median score", median_val)  # median

We now have statistics as part of the standard library:
import statistics
# Sample data: the integers 0..9.
nums = [*range(10)]
med = statistics.median(nums)

# Split strictly around the median; values equal to it land in neither half.
upper_half = [value for value in nums if value > med]
lower_half = [value for value in nums if value < med]
hi_avg = statistics.mean(upper_half)
lo_avg = statistics.mean(lower_half)

You can use the built-in functions filter and sum. For example:
# On Python 3 filter() returns a one-shot iterator, which has no len();
# materialise it so it can be both summed and counted.
above_med = list(filter(lambda x: x>median_val, values))
# NOTE: raises ZeroDivisionError when no score is strictly above the median.
print(" average of scores above median ", sum(above_med)/len(above_med))
Edited:
As suggested by @ChrisP, you can also use the standard-library statistics package, introduced in Python 3.4.

Here is an example:
import numpy as np
data_array = np.array(data)
med = np.median(data)
ave_above_med = data_array[data_array > med].mean()
so the function would be:
import numpy as np
def average_above_med(data):
    """Return the mean of the values in *data* that are strictly above its median.

    Args:
        data: A sequence (or array) of numbers.

    Returns:
        The mean of the values greater than the median, or ``nan`` when no
        value lies above the median (for example when all values are equal).
    """
    data_array = np.asarray(data)
    med = np.median(data_array)
    above = data_array[data_array > med]
    if above.size == 0:
        # Same result NumPy would give, minus its "mean of empty slice" warning.
        return float("nan")
    return above.mean()
This is a test of that:
test_data = [1, 5, 66, 7, 5]
print(average_above_med(test_data))
which displays:
36.5
Hope this helps.

Related

How to understand what is wrong with my Python loop code

I have written this code to extract only the digits from a text file and then calculate the sum of the extracted values. But I am getting 0 as the answer, when it should actually be 285701. I don't understand what I am doing wrong even after working on it for a long time; I am not very experienced in programming and have just started learning.
import re
fname = open("http://py4e-data.dr-chuck.net/regex_sum_1501185.txt")
sum = 0
value = list()
for line in fname:
line = re.findall("[0-9]+", line)
value = value + line
for x in value:
sum = sum + int(x)
print(sum)
You can't open web URLs with open(); you need to use urllib.request.urlopen():
import urllib.request
import re
# Download the whole document; the context manager closes the HTTP response.
with urllib.request.urlopen("http://py4e-data.dr-chuck.net/regex_sum_1501185.txt") as fname:
    data = fname.read().decode()
data = data.split('\n')
sum = 0  # NOTE: this name shadows the builtin sum() for the rest of the module
value = list()
for line in data:
    # Every run of digits on the line, as strings.
    nums = re.findall("[0-9]+", line)
    value = value + nums
for x in value:
    sum = sum + int(x)
print(sum)
Output:
285701
You need to be careful with your variable names: naming a variable sum shadows the built-in function sum(), so you can no longer call it.
It would be better if your code looked like this:
import urllib.request
import re
# Read the complete response: a fixed read(50000) would silently drop
# everything past the first 50000 bytes of a larger file.
with urllib.request.urlopen("http://py4e-data.dr-chuck.net/regex_sum_1501185.txt") as fname:
    data = fname.read().decode()
data = data.split('\n')
value = list()
for line in data:
    line = re.findall("[0-9]+", line)
    # extend() appends in place instead of rebuilding the list on every line.
    value.extend(int(i) for i in line)
print(sum(value))
Docs

Trying to optimize quicksort for larger files

Does anyone know how I can optimize this code better to run larger files. It works with smaller inputs, but I need it to run a file with over 200,000 words. Any suggestions?
Thank you.
import random
import re
def quick_sort(a, i, n):
    """Sort the slice ``a[i:i + n]`` in place with a three-way quicksort.

    Args:
        a: Mutable sequence of mutually comparable items.
        i: Index of the first element of the slice to sort.
        n: Number of elements in the slice.

    The pivot is drawn at random from the slice being sorted (not from the
    whole list), so every pass removes at least the pivot from further work.
    Only the smaller partition is handled recursively; the larger one is
    processed by the loop, which keeps the recursion depth logarithmic.
    """
    while n > 1:
        x = a[random.randrange(i, i + n)]
        p = i - 1   # a[i..p] holds the items smaller than the pivot
        j = i       # next item to classify
        q = i + n   # a[q..i+n-1] holds the items larger than the pivot
        while j < q:
            if a[j] < x:
                p += 1
                a[j], a[p] = a[p], a[j]
                j += 1
            elif a[j] > x:
                q -= 1
                # The swapped-in item is unclassified, so j is not advanced.
                a[j], a[q] = a[q], a[j]
            else:
                j += 1
        left_n = p - i + 1
        right_n = n - (q - i)
        if left_n < right_n:
            quick_sort(a, i, left_n)
            i, n = q, right_n
        else:
            quick_sort(a, q, right_n)
            n = left_n
file_name = input("Enter file name: ")
my_list = []
with open(file_name,'r') as f:
    for line in f:
        # Drop punctuation and lowercase the line before splitting it into words.
        line = re.sub('[!#?,.:";\']', '', line).lower()
        token = line.split()
        my_list.extend(token)
a = my_list  # same list object: sorting `a` sorts `my_list` too
quick_sort(a,0,len(my_list))
print("List After Calling Quick Sort: ",a)
Your random selection of an index to use for your pivot x is using the whole size of the input list a, not just the part you're supposed to be sorting on the current call. This means that very often your pivot won't be in the current section at all, and so you won't be able to usefully reduce your problem (because all of the values will be on the same side of the pivot). This leads to lots and lots of recursion, and for larger inputs you'll almost always hit the recursion cap.
The fix is simple, just change how you get x:
x = a[random.randrange(i, i+n)]
I like randrange a lot better than randint, but you could use randint(i, i+n-1) if you feel the other way.
Must you use a quicksort? If you can use a heapq or PriorityQueue, the .get/(.pop()) methods automatically implement the sort:
import sys
from queue import PriorityQueue
pq = PriorityQueue()
# Re-open standard input by file descriptor with newline='\n'.
inp = open(sys.stdin.fileno(), newline='\n')
#inp = ['dag', 'Rug', 'gob', 'kex', 'mog', 'Wes', 'pox', 'sec', 'ego', 'wah'] # for testing
for word in inp:
    word = word.rstrip('\n')
    pq.put(word)
# get() always returns the smallest remaining item, so this prints in sorted order.
while not pq.empty():
    print(pq.get())
Then test with some large random word input or file e.g.:
shuf /usr/share/dict/words | ./word_pq.py
where shuf is Gnu /usr/local/bin/shuf.

How do I sum up all numbers in a line in csv file in Python?

Basically I have a .csv file:
math,4,5,5,4,4,5
biology,3,4,4,2,3,2
chemistry,3,5,4,3,4,2
english,5,5,5,4,5,5
And in the end I need it to print the arithmetic mean of the subject:
math: 4.5
biology: 3.0
chemistry: 3.5
english: 4.8
I've tried some stuff and I got all of the numbers into a one list, but it doesn't help me much.
Edit: Added, what I have so far.
fail = open("grades.csv", encoding = "UTF-8")
info = []
a = []
for row in fail:
parts = row.strip("\n").split(",")
info.append(parts)
fail.close()
print(info)
for el in info:
print(str(el[0]) + ":")
for i in el[1:]:
a.append(i)
print(a)
# Print each subject followed by the arithmetic mean of its grades.
with open('myfile.csv') as grades_file:
    for line in grades_file:
        data = line.strip().split(",")
        subject = data[0]
        grades = [float(x) for x in data[1:]]
        if not grades:
            # Blank line or a subject without grades: nothing to average.
            continue
        print(subject, sum(grades)/len(grades))
You should use the csv module to process csv data:
import csv
# newline="" lets the csv module do its own newline handling.
with open("grades.csv", encoding = "UTF-8", newline="") as f:
    for row in csv.reader(f):
        if len(row) < 2:
            # Empty row, or a subject with no grades: skip instead of failing.
            continue
        sub, *grades = row
        grades = list(map(float, grades))
        avg = sum(grades) / len(grades)
        print("{}: {}".format(sub, avg))
# NOTE(review): assumes `df` is a pandas DataFrame with one subject per row
# (name in the first column, marks in the rest) — confirm against how df is loaded.
dic = {}
for i in range(len(df)):
    # iloc is positional, so this works whatever index labels df carries;
    # the row is converted with tolist() once instead of twice.
    row = df.iloc[i].tolist()
    subject = row[0]
    total = row[1:]
    dic[subject] = sum(total)/len(total)
You can use the tolist() method to separate the subject name and the marks. Then you can calculate the average.

Populating python matrix

I'm splitting the words from a text file in Python. I have the number of rows (c) and a dictionary (word_positions) mapping each word to an index. Then I create a zero matrix of shape (c, index). Here is the code:
from collections import defaultdict
import re
import numpy as np
# Count the lines (rows) of the file.
c=0
with open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r') as f:
    for line in f:
        c = c + 1
# Give every distinct lowercase word a column index, in order of first appearance.
word_positions = {}
with open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r') as f:
    index = 0
    for word in re.findall(r'[a-z]+', f.read().lower()):
        if word not in word_positions:
            word_positions[word] = index
            index += 1
print(word_positions)
# np.zeros takes the shape as ONE tuple; a second positional argument is the dtype.
matrix=np.zeros((c, index))
My question: How can I populate the matrix to be able to get this: matrix[c,index] = count, where c - is the number of row, index -the indexed position and count -the number of counted words in a row
Try next:
import re
import numpy as np
from itertools import chain
with open('/Users/Half_Pint_Boy/Desktop/sentenses.txt') as text:
    text_list = text.readlines()
c = len(text_list)  # number of lines (one sentence per line)
text_niz = [line.lower() for line in text_list]  # converted to lower case
# Tokenization: split on every non-letter and drop the empty strings that leaves.
slovo = [[w for w in re.split('[^a-z]', line) if w] for line in text_niz]
slovo_list = list(chain(*slovo))
print (slovo_list)  # flat list of all words
# Removed duplicates; sorted so the column order is the same on every run.
slovo_list = sorted(set(slovo_list))
x=len(slovo_list)
# Counted how many times each unique word occurs in each sentence (row-major order).
s = [words.count(word) for words in slovo for word in slovo_list]
matr = np.array(s)  # word-occurrence counts as a flat array
d = matr.reshape((c, x))  # reshaped into a (sentences x unique words) matrix, 22*254 for the author's file
It looks like you are trying to create something similar to an n-dimensional list. these are achieved by nesting lists inside themselves as such:
# The inner lists may have different lengths and hold any kind of value.
two_d_list = [[0, 1], [1, 2], ["example", "blah", "blah blah"]]
words = two_d_list[2]
single_word = two_d_list[2][1] # Notice the second index operator
This concept is very flexible in Python and can also be done with a dictionary nested inside as you would like:
two_d_list = [{"word":1}, {"example":1, "blah":3}]
words = two_d_list[1] # type(words) == dict
# The list has two rows, so the last valid row index is 1 (index 2 raises IndexError).
single_word = two_d_list[1]["example"] # Similar index operator, but for the dictionary
This achieves what you would like, functionally, but does not use the syntax matrix[c,index]; built-in lists do not support that form of indexing (NumPy arrays do). Inside a list literal, commas within square brackets delineate the elements. Instead you can access the row's dictionary's element with matrix[c][index] = count
You may be able to overload the index operator to achieve the syntax you want. Here is a question about achieving the syntax you desire. In summary:
Overload the __getitem__(self, index) function in a wrapper of the list class and set the function to accept a tuple. The tuple can be created without parentheses, giving the syntax matrix[c, index] = count

calculating means from csv with python's numpy

I have a 10GB (can't fit in RAM) file of the format:
Col1,Col2,Col3,Col4
1,2,3,4
34,256,348,
12,,3,4
So we have columns and missing values and I want to calculate the means of columns 2 and 3. With plain python I would do something like:
def means(rng):
    """Return the per-column means of a column range of ``data.csv``.

    Args:
        rng: ``(start, end)`` pair; columns ``start`` up to but excluding
            ``end`` (zero-based) are averaged.

    Returns:
        A list of floats, one mean per selected column.  Cells that are
        empty or not integers add nothing to the sum, but their row still
        counts in the divisor, so a missing value behaves like 0.

    Raises:
        ValueError: If the file has a header line but no data rows.
    """
    s, e = rng
    ret = [0] * (e - s)
    rows = 0
    with open("data.csv") as fd:
        # Strip the line ending so the last title does not carry a newline.
        titles = next(fd).rstrip("\r\n").split(',')
        print("Means for", ",".join(titles[s:e]))
        for l in fd:
            rows += 1
            for i, v in enumerate(l.split(",")[s:e]):
                try:
                    ret[i] += int(v)
                except ValueError:
                    # Missing or non-integer cell: leave the sum unchanged.
                    pass
    if rows == 0:
        raise ValueError("data.csv contains no data rows")
    # A real list: map() would be a lazy iterator on Python 3.
    return [float(total) / rows for total in ret]
But I suspect there is a much faster way to do this with numpy (I am still a novice at it).
Pandas is your best friend:
from pandas.io.parsers import read_csv
from numpy import sum
# Load 10000 elements at a time, you can play with this number to get better
# performance on your machine
# With chunksize set, read_csv yields the file chunk by chunk instead of
# loading it all at once.
my_data = read_csv("data.csv", chunksize=10000)
total = 0
count = 0
for chunk in my_data:
    # If you want to exclude NAs from the average, remove the next line
    chunk = chunk.fillna(0.0)
    # Accumulate per-column sums and non-NA cell counts across the chunks.
    total += chunk.sum(skipna=True)
    count += chunk.count()
avg = total / count
col1_avg = avg["Col1"]
# ... etc. ...
Try:
import numpy
# read from csv into record array
df = numpy.genfromtxt('test.csv',delimiter=',', usecols=(1,2), skip_header=1, usemask=True)
# calc means on columns (the array loaded above is named `df`, not `dat`)
ans = numpy.mean(df, axis=0)
ans.data will contain an array of all the means for the columns.
EDITS for Updated Question
If you have a 10G file you can chunk it with numpy as well. See this answer.
Something like this:
import itertools

sums = numpy.zeros(2)
counts = numpy.zeros(2, dtype=int)
with open('test.csv') as fH:
    fH.readline() # skip header
    while True:
        # Pull the next 1000 lines ourselves: an exhausted file yields an empty
        # list, which is a reliable stop condition (genfromtxt does not raise
        # StopIteration on empty input).
        lines = list(itertools.islice(fH, 1000))
        if not lines:
            break
        # atleast_2d keeps a one-row chunk two-dimensional so axis 0 is always rows.
        df = numpy.ma.atleast_2d(
            numpy.genfromtxt(lines, delimiter=',', usecols=(1,2), usemask=True)
        )
        if df.size == 0:
            continue  # chunk held only blank lines
        # Masked (missing) cells add 0 to the sum and are left out of the count.
        sums = sums + df.filled(0).sum(axis=0)
        counts = counts + (~numpy.ma.getmaskarray(df)).sum(axis=0)
means = sums / counts

Categories