Best simple and effective solution for data organization - python

I'm new with Python programming so I'm doing a bunch of practice exercise in order to improve my skills.
Therefore, I would like to show you guys my approach on this example and if you could let me know what you think I would be grateful!
Exercise:
Given a list of costumers IDs, I have to segment them by the following logic:
If ID is multiple of 7 and multiple of 3 then segment 'A'
If ID is multiple of 3 then segment 'B'
If ID is multiple of 7, then segment 'C'
Else, segment 'D'
What I've done:
from collections import Counter
from datetime import datetime
import os.path
import json
date = datetime.today().strftime('%Y-%m-%d')
customer_indices = [list of IDs] ex: [123981,12398,123157,12371...]
def segment(customer):
if customer % 7 == 0 & customer % 3 == 0:
return 'A'
elif customer % 7 == 0:
return 'B'
elif customer % 3 == 0:
return 'C'
else:
return 'D'
def split_customers(customers):
a = []
b = []
c = []
d = []
for customer in customers:
if customer % 7 == 0 & customer % 3 == 0:
a.append(customer)
elif customer % 7 == 0:
b.append(customer)
elif customer % 3 == 0:
c.append(customer)
else:
d.append(customer)
return a,b,c,d
segmentation = [segment(customer) for customer in customer_indices]
print('Segmentation list: ')
print(segmentation)
print('\n')
segmentation_counter = Counter(segmentation)
print('Count of clients per segment: ')
print(f"A: {segmentation_counter['A']}")
print(f"B: {segmentation_counter['B']}")
print(f"C: {segmentation_counter['C']}")
print(f"D: {segmentation_counter['D']}")
a, b, c, d = split_customers(customer_indices)
main_dict = {'Date': date,
'Segmentation': {
'A Clients': {
'Count': segmentation_counter['A'],
'Customers': a},
'B Clients': {
'Count': segmentation_counter['B'],
'Customers': b},
'C Clients': {
'Count': segmentation_counter['C'],
'Customers': c},
'D Clients': {
'Count': segmentation_counter['D'],
'Customers': d}}}
main_list = [main_dict]
if not os.path.exists('Data/customer_segmentation.json'):
os.makedirs('Data')
if os.path.isfile('Data/customer_segmentation.json'):
with open('Data/customer_segmentation.json') as file:
data = json.load(file)
file.close()
data.append(main_dict)
with open('Data/customer_segmentation.json', 'w') as file:
json.dump(data, file, indent=2)
file.close()
else:
file = open('Data/customer_segmentation.json', 'w')
json.dump(main_list, file, indent=2)
file.close()
The original code has a with open txt function at first that extracts the client's ID list from a txt within the same directory of this .py
The main idea of this solution it would be to execute this every day so the json file will update with a new list for each day that it's run, so if I want to do an analysis of the segmentation growth, it would be pretty easy to do so.
What do you think?

Related

Prompting user to enter column names from a csv file (not using pandas framework)

I am trying to get the column names from a csv file with nearly 4000 rows. There are about 14 columns.
I am trying to get each column and store it into a list and then prompt the user to enter themselves at least 5 columns they want to look at.
The user should then be able to type how many results they want to see (they should be the smallest results from that column).
For example, if they choose clothing_brand, "8", the 8 least expensive brands are displayed.
So far, I have been able to use "with" and get a list that contains each column, but I am having trouble prompting the user to pick at least 5 of those columns.
You can very well use the Python input to get the input from user, if you want to prompt no. of times, use the for loop to get inputs. Check Below code:
def get_user_val(no_of_entries = 5):
print('Enter {} inputs'.format(str(no_of_entries)))
val_list = []
for i in range(no_of_entries):
val_list.append(input('Enter Input {}:'.format(str(i+1))))
return val_list
get_user_val()
I hope I didn't misunderstand what you mean, the code below is what you want?
You can put the data into the dict then sorted it.
Solution1
from io import StringIO
from collections import defaultdict
import csv
import random
import pprint
def random_price():
return random.randint(1, 10000)
def create_test_data(n_row=4000, n_col=14, sep=','):
columns = [chr(65+i) for i in range(n_col)] # A, B ...
title = sep.join(columns)
result_list = [title]
for cur_row in range(n_row):
result_list.append(sep.join([str(random_price()) for _ in range(n_col)]))
return '\n'.join(result_list)
def main():
if 'load CSV':
test_content = create_test_data(n_row=10, n_col=5)
dict_brand = defaultdict(list)
with StringIO(test_content) as f:
rows = csv.reader(f, delimiter=',')
for idx, row in enumerate(rows):
if idx == 0: # title
columns = row
continue
for i, value in enumerate(row):
dict_brand[columns[i]].append(int(value))
pprint.pprint(dict_brand, indent=4, compact=True, width=120)
user_choice = input('input columns (brand)')
number_of_results = 5 # input('...')
watch_columns = user_choice.split(' ') # D E F
for col_name in watch_columns:
cur_brand_list = dict_brand[col_name]
print(sorted(cur_brand_list, reverse=True)[:number_of_results])
# print(f'{col_name} : {sorted(cur_brand_list)}') # ASC
# print(f'{col_name} : {sorted(cur_brand_list, reverse=True)}') # DESC
if __name__ == '__main__':
main()
defaultdict(<class 'list'>,
{ 'A': [9424, 6352, 5854, 5870, 912, 9664, 7280, 8306, 9508, 8230],
'B': [1539, 1559, 4461, 8039, 8541, 4540, 9447, 512, 7480, 5289],
'C': [7701, 6686, 1687, 3134, 5723, 6637, 6073, 1925, 4207, 9640],
'D': [4313, 3812, 157, 6674, 8264, 2636, 765, 2514, 9833, 1810],
'E': [139, 4462, 8005, 8560, 5710, 225, 5288, 6961, 6602, 4609]})
input columns (brand)C D
[9640, 7701, 6686, 6637, 6073]
[9833, 8264, 6674, 4313, 3812]
Solution2: Using Pandas
def pandas_solution(test_content: str, watch_columns= ['C', 'D'], number_of_results=5):
with StringIO(test_content) as f:
df = pd.read_csv(StringIO(f.read()), usecols=watch_columns,
na_filter=False) # it can add performance (ignore na)
dict_result = defaultdict(list)
for col_name in watch_columns:
dict_result[col_name].extend(df[col_name].sort_values(ascending=False).head(number_of_results).to_list())
df = pd.DataFrame.from_dict(dict_result)
print(df)
C D
0 9640 9833
1 7701 8264
2 6686 6674
3 6637 4313
4 6073 3812

loop is not working when I try to read a Json file and a text file with python

I have a json file with objects and a text file with several groups (Each group have 5 numbers and I have them in a list this way: the first number of each group are in list 1, the second number of each group, are in list 2, etc). I basically have to match each object of the json with each group I created. The problem is that Im getting as result the last element from the Json. The groups from the text file are created in the correct way.
This is my code:
import json
NUM_LIST = 5
index = 0
def report(a, b, c, d, e, index):
json_file = 'json_global.json'
json_data = open(json_file)
data = json.load(json_data)
i = 0
index = 0
item = 0
cmd = " "
ind = 0
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
index+= 1
print item, cmd, a, b, c, d, e
f = open("Output.txt", "r")
lines = [line.rstrip() for line in f if line != "\n"]
NUM_LISTS = 5
groups = [[] for i in range(NUM_LISTS)]
listIndex = 0
for line in lines:
if "Transactions/Sec for Group" not in line:
groups[listIndex].append(float(line))
listIndex += 1
if listIndex == NUM_LISTS:
listIndex = 0
value0 = groups[0]
value1 = groups[1]
value2 = groups[2]
value3 = groups[3]
value4 = groups[4]
for i in range(0, 5):
a = value0[i]
b = value1[i]
c = value2[i]
d = value3[i]
e = value4[i]
i += 1
report(a, b, c, d, e, index)
The Json file looks like:
[
{
"item": 1,
"command": "AA"
},
{
"item": 2,
"command": "BB",
},
{
"item": 3,
"command": "CC",
},
{
"item": 4,
"command": "DD",
},
{
"item": 5,
"command": "EE",
}
]
The text file looks like this:
Transactions/Sec for Group = AA\CODE1\KK
1011.5032
2444.8864
2646.6893
2740.8531
2683.8178
Transactions/Sec for Group = BB\CODE1\KK
993.2360
2652.8784
3020.2740
2956.5260
3015.5910
Transactions/Sec for Group = CC\CODE1\KK
1179.5766
3271.5700
4588.2059
4174.6358
4452.6785
Transactions/Sec for Group = DD\CODE1\KK
1112.2567
3147.1466
4014.8404
3913.3806
3939.0626
Transactions/Sec for Group = EE\CODE1\KK
1205.8499
3364.8987
4401.1702
4747.4354
4765.7614
The logic in the body of the program works fine. The groups appears ok, but instead of having the list from 1 to 5 from the Json file, is appearing everything with the number 5 command EE. Instead should appear: Item 1, 2, 3, 4, 5, with their commands
My list 1 will have the numbers: 1011.5032, 993.2360, 1179.5766, 1112.2567, 1205.8499.
My list 2 will have the numbers: 2444.8864, 2652.8784, 3271.5700, 3147.1466,
The python version I'm using is 2.6
Based on your explanation it's hard to tell what you're trying to do -- do you mean the nested loop below? The inner loop executes 5 times, but in every iteration it overwrites the previous values for item and cmd.
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
index+= 1
Try printing the values each time the inner loop executes:
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
print item, cmd
index+= 1
I think this code is your problem:
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
Item will always be "5" and command will always be "EE" after this executes. Perhaps your indents are off for the code beneath it, and that code is supposed to be within the loop?

Data generation incomplete: Python random

I am trying to write a script to generate data. I am using random package for this. I execute the script and everything works fine. But when I check through the results, I found out that the script fails to generate the last 100+ rows for some reason.
Can someone suggest me why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;
## Vaue declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax= 3.5;
## Define Column Header
Column_Names = "Population_ID",";","Product_Name",";","Product_ID",";","Year",";",
"Month",";","Day","Quantity_sold",";","Sales_Price",";","Discount",
";","Actual_Sales_Price",tax;
## Function to generate sales related information
def sales_data():
for x in range(0,1):
quantity_sold = random.randint(5,20);
discount = random.choice(range(5,11));
sales_price = random.uniform(20,30);
return quantity_sold,round(sales_price,2),discount,round((sales_price)-(sales_price*discount)+(sales_price*tax));
## Format the month to quarter and return the value
def quarter(month):
if month >= 1 and month <= 3:
return "Q1";
elif month > 3 and month <= 6:
return "Q2";
elif month > 6 and month <= 9:
return "Q3";
else:
return "Q4";
## Generate product_id
def product_name():
str2 = "PROD";
sample2 = random.sample([1,2,3,4,5,6,7,8,9],5);
string_list = [];
for x in sample2:
string_list.append(str(x));
return (str2+''.join(string_list));
### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv",'w')
print (Column_Names, result_log);
### Loop and Generate Data ###
for pop in range(0,population):
pop = random.randint(55000,85000);
for prod_id in range(0,product):
product_name2 = product_name();
for year in range(1,years):
for month in range(1,months):
for day in range(1,31):
a = sales_data();
rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
print(rows,file=result_log);
#print (rows);
tax = tax+1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with result_log = open(filename, 'w'):
# code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
"Population_ID", "Product_Name", "Product_ID", "Year",
"Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
"Actual_Sales_Price", tax)
# ..
with result_log = open(filename, 'wb'):
writer = csv.writer(result_log, delimiter=';')
writer.writerow(column_names)
# looping
row = [pop, product_name2, prod_id, year, month, quarter(month), day,
a[0], a[1], a[2], tax, a[3]]
writer.writerow(row)

re reading a csv file in python without loading it again

I made the following code which works but I want to improve it. I don't want to re-read the file, but if I delete sales_input.seek(0) it won't iterate throw each row in sales. How can i improve this?
def computeCritics(mode, cleaned_sales_input = "data/cleaned_sales.csv"):
if mode == 1:
print "creating customer.critics.recommendations"
critics_output = open("data/customer/customer.critics.recommendations",
"wb")
ID = getCustomerSet(cleaned_sales_input)
sales_dict = pickle.load(open("data/customer/books.dict.recommendations",
"r"))
else:
print "creating books.critics.recommendations"
critics_output = open("data/books/books.critics.recommendations",
"wb")
ID = getBookSet(cleaned_sales_input)
sales_dict = pickle.load(open("data/books/users.dict.recommendations",
"r"))
critics = {}
# make critics dict and pickle it
for i in ID:
with open(cleaned_sales_input, 'rb') as sales_input:
sales = csv.reader(sales_input) # read new
for j in sales:
if mode == 1:
if int(i) == int(j[2]):
sales_dict[int(j[6])] = 1
else:
if int(i) == int(j[6]):
sales_dict[int(j[2])] = 1
critics[int(i)] = sales_dict
pickle.dump(critics, critics_output)
print "done"
cleaned_sales_input looks like
6042772,2723,3546414,9782072488887,1,9.99,314968
6042769,2723,3546414,9782072488887,1,9.99,314968
...
where number 6 is the book ID and number 0 is the customer ID
I want to get a dict wich looks like
critics = {
CustomerID1: {
BookID1: 1,
BookID2: 0,
........
BookIDX: 0
},
CustomerID2: {
BookID1: 0,
BookID2: 1,
...
}
}
or
critics = {
BookID1: {
CustomerID1: 1,
CustomerID2: 0,
........
CustomerIDX: 0
},
BookID1: {
CustomerID1: 0,
CustomerID2: 1,
...
CustomerIDX: 0
}
}
I hope this isn't to much information
Here are some suggestions:
Let's first look at this code pattern:
for i in ID:
for j in sales:
if int(i) == int(j[2])
notice that i is only being compared with j[2]. That's its only purpose in the loop. int(i) == int(j[2]) can only be True at most once for each i.
So, we can completely remove the for i in ID loop by rewriting it as
for j in sales:
key = j[2]
if key in ID:
Based on the function names getCustomerSet and getBookSet, it sounds as if
ID is a set (as opposed to a list or tuple). We want ID to be a set since
testing membership in a set is O(1) (as opposed to O(n) for a list or tuple).
Next, consider this line:
critics[int(i)] = sales_dict
There is a potential pitfall here. This line is assigning sales_dict to
critics[int(i)] for each i in ID. Each key int(i) is being mapped to the very same dict. As we loop through sales and ID, we are modifying sales_dict like this, for example:
sales_dict[int(j[6])] = 1
But this will cause all values in critics to be modified simultaneously, since all keys in critics point to the same dict, sales_dict. I doubt that is what you want.
To avoid this pitfall, we need to make copies of the sales_dict:
critics = {i:sales_dict.copy() for i in ID}
def computeCritics(mode, cleaned_sales_input="data/cleaned_sales.csv"):
if mode == 1:
filename = 'customer.critics.recommendations'
path = os.path.join("data/customer", filename)
ID = getCustomerSet(cleaned_sales_input)
sales_dict = pickle.load(
open("data/customer/books.dict.recommendations", "r"))
key_idx, other_idx = 2, 6
else:
filename = 'books.critics.recommendations'
path = os.path.join("data/books", filename)
ID = getBookSet(cleaned_sales_input)
sales_dict = pickle.load(
open("data/books/users.dict.recommendations", "r"))
key_idx, other_idx = 6, 2
print "creating {}".format(filename)
ID = {int(item) for item in ID}
critics = {i:sales_dict.copy() for i in ID}
with open(path, "wb") as critics_output:
# make critics dict and pickle it
with open(cleaned_sales_input, 'rb') as sales_input:
sales = csv.reader(sales_input) # read new
for j in sales:
key = int(j[key_idx])
if key in ID:
other_key = int(j[other_idx])
critics[key][other_key] = 1
critics[key] = sales_dict
pickle.dump(dict(critics), critics_output)
print "done"
#unutbu's answer is better but if you are stuck with this structure you can put the whole file in memory:
sales = []
with open(cleaned_sales_input, 'rb') as sales_input:
sales_reader = csv.reader(sales_input)
[sales.append(line) for line in sales_reader]
for i in ID:
for j in sales:
#do stuff

Index similar entries in Python

I have a column of data (easily imported from Google Docs thanks to gspread) that I'd like to intelligently align. I ingest entries into a dictionary. Input can include email, twitter handle or a blog URL. For example:
mike.j#gmail.com
#mikej45
j.mike#world.eu
_http://tumblr.com/mikej45
Right now, the "dumb" version is:
def NomineeCount(spreadsheet):
worksheet = spreadsheet.sheet1
nominees = worksheet.col_values(6) # F = 6
unique_nominees = {}
for c in nominees:
pattern = re.compile(r'\s+')
c = re.sub(pattern, '', c)
if unique_nominees.has_key(c) == True: # If we already have the name
unique_nominees[c] += 1
else:
unique_nominees[c] = 1
# Print out the alphabetical list of nominees with leading vote count
for w in sorted(unique_nominees.keys()):
print string.rjust(str(unique_nominees[w]), 2)+ " " + w
return nominees
What's an efficient(-ish) way to add in some smarts during the if process?
You can try with defaultdict:
from collections import defaultdict
unique_nominees = defaultdict(lambda: 0)
unique_nominees[c] += 1

Categories