re reading a csv file in python without loading it again - python

I wrote the following code, which works, but I want to improve it. I don't want to re-read the file for every ID, but if I delete sales_input.seek(0) it won't iterate through each row in sales. How can I improve this?
def computeCritics(mode, cleaned_sales_input = "data/cleaned_sales.csv"):
if mode == 1:
print "creating customer.critics.recommendations"
critics_output = open("data/customer/customer.critics.recommendations",
"wb")
ID = getCustomerSet(cleaned_sales_input)
sales_dict = pickle.load(open("data/customer/books.dict.recommendations",
"r"))
else:
print "creating books.critics.recommendations"
critics_output = open("data/books/books.critics.recommendations",
"wb")
ID = getBookSet(cleaned_sales_input)
sales_dict = pickle.load(open("data/books/users.dict.recommendations",
"r"))
critics = {}
# make critics dict and pickle it
for i in ID:
with open(cleaned_sales_input, 'rb') as sales_input:
sales = csv.reader(sales_input) # read new
for j in sales:
if mode == 1:
if int(i) == int(j[2]):
sales_dict[int(j[6])] = 1
else:
if int(i) == int(j[6]):
sales_dict[int(j[2])] = 1
critics[int(i)] = sales_dict
pickle.dump(critics, critics_output)
print "done"
cleaned_sales_input looks like
6042772,2723,3546414,9782072488887,1,9.99,314968
6042769,2723,3546414,9782072488887,1,9.99,314968
...
where column 6 is the book ID and column 2 is the customer ID (these are the indices the code uses: j[6] and j[2])
I want to get a dict which looks like
critics = {
CustomerID1: {
BookID1: 1,
BookID2: 0,
........
BookIDX: 0
},
CustomerID2: {
BookID1: 0,
BookID2: 1,
...
}
}
or
critics = {
BookID1: {
CustomerID1: 1,
CustomerID2: 0,
........
CustomerIDX: 0
},
BookID1: {
CustomerID1: 0,
CustomerID2: 1,
...
CustomerIDX: 0
}
}
I hope this isn't too much information

Here are some suggestions:
Let's first look at this code pattern:
for i in ID:
for j in sales:
if int(i) == int(j[2])
notice that i is only being compared with j[2]. That's its only purpose in the loop. Since ID holds distinct values, int(i) == int(j[2]) can be True for at most one i per row j.
So, we can completely remove the for i in ID loop by rewriting it as
for j in sales:
key = j[2]
if key in ID:
Based on the function names getCustomerSet and getBookSet, it sounds as if
ID is a set (as opposed to a list or tuple). We want ID to be a set since
testing membership in a set is O(1) (as opposed to O(n) for a list or tuple).
Next, consider this line:
critics[int(i)] = sales_dict
There is a potential pitfall here. This line is assigning sales_dict to
critics[int(i)] for each i in ID. Each key int(i) is being mapped to the very same dict. As we loop through sales and ID, we are modifying sales_dict like this, for example:
sales_dict[int(j[6])] = 1
But this will cause all values in critics to be modified simultaneously, since all keys in critics point to the same dict, sales_dict. I doubt that is what you want.
To avoid this pitfall, we need to make copies of the sales_dict:
critics = {i:sales_dict.copy() for i in ID}
def computeCritics(mode, cleaned_sales_input="data/cleaned_sales.csv"):
    """Build the critics dict in one pass over the sales CSV and pickle it.

    With ``mode == 1`` the result is keyed by the IDs returned by
    ``getCustomerSet`` (matched against CSV column 2) and each inner dict is
    keyed by CSV column 6; any other mode swaps the two columns and uses
    ``getBookSet``.  Every key starts from its own copy of the pickled
    template dict, and an inner entry is set to 1 for each matching sales row.

    The snippet targets Python 2 (csv files are opened in ``'rb'`` mode).
    """
    if mode == 1:
        filename = 'customer.critics.recommendations'
        path = os.path.join("data/customer", filename)
        ID = getCustomerSet(cleaned_sales_input)
        template_path = "data/customer/books.dict.recommendations"
        key_idx, other_idx = 2, 6
    else:
        filename = 'books.critics.recommendations'
        path = os.path.join("data/books", filename)
        ID = getBookSet(cleaned_sales_input)
        template_path = "data/books/users.dict.recommendations"
        key_idx, other_idx = 6, 2

    # Use a context manager so the template file handle is not leaked.
    with open(template_path, "r") as template_file:
        sales_dict = pickle.load(template_file)

    print("creating {}".format(filename))
    ID = {int(item) for item in ID}
    # Each key needs its OWN copy; sharing one dict would make every update
    # visible under every key.
    critics = {i: sales_dict.copy() for i in ID}

    with open(cleaned_sales_input, 'rb') as sales_input:
        for j in csv.reader(sales_input):
            key = int(j[key_idx])
            if key in ID:
                critics[key][int(j[other_idx])] = 1
    # NOTE: do not re-assign critics[key] = sales_dict here -- that would
    # replace the populated copy for the last row's key with the untouched
    # shared template.

    # Open the output only once the data is ready, so a failure while reading
    # the CSV does not leave a truncated output file behind.
    with open(path, "wb") as critics_output:
        pickle.dump(critics, critics_output)
    print("done")

@unutbu's answer is better, but if you are stuck with this structure you can put the whole file in memory:
# Read every row once and keep the rows in memory, so the nested loops below
# can iterate over them repeatedly without re-opening or rewinding the file.
with open(cleaned_sales_input, 'rb') as sales_input:
    sales = list(csv.reader(sales_input))
for i in ID:
    for j in sales:
        pass  # do stuff

Related

I want to use my variables in one method in another method

I want to use the variables in my "Person Builder" function in the JSON I will create, but I cannot pull variables like "PersonID" in the "jsonAPI" function. How do I solve this problem?
My Code :
def PersonBuilder():
    """Return the sample person record as a dict of its five fields."""
    return {
        'PersonID': 1,
        'PersonName': "Behzat",
        'PersonSurname': "Çözer",
        'PersonCompany': "EGM",
        'PersonTitle': "Başkomiser",
    }
def PhoneNumberBuilder():
    """Return the sample phone record as a dict of its four fields."""
    return {
        'PhoneID': 1,
        'PhoneCountry': "Turkey",
        'PhoneOperator': "TR XXXXXX",
        'PhoneNumber': "+905XXXXXXXX",
    }
def jsonAPI():
myjson3 = {
"Person":{
'PersonID' : PersonID,
'PersonName' : PersonName,
'PersonSurname': PersonSurname,
'PersonCompany': PersonCompany,
'PersonTitle': PersonTitle,
'PhoneID':PhoneID,
'PhoneCountry': PhoneCountry,
'PhoneOperator':PhoneOperator,
'PhoneNumber':PhoneNumber
}
}
out_file = open("myfile.json", "w")
json.dump(myjson3, out_file, indent = 6)
jsonify(myjson3)
if __name__ == "__main__":
The functions return dictionaries. You can combine these two dictionaries to get the Person dictionary in myjson3.
def jsonAPI():
    """Merge the person and phone dicts and write them to myfile.json.

    The two builder functions each return a dict; ``|`` (Python 3.9+) merges
    them into the single "Person" mapping.
    """
    myjson3 = {
        "Person": PersonBuilder() | PhoneNumberBuilder()
    }
    with open("myfile.json", "w") as out_file:
        # The keyword is ``indent``; ``index`` is not accepted by json.dump
        # and raises TypeError.
        json.dump(myjson3, out_file, indent=6)
    # NOTE(review): the value produced by jsonify() is discarded here, as in
    # the question's code -- return it if a caller needs the response.
    jsonify(myjson3)
Call the methods and use the values they return.
def jsonAPI():
myjson3 = PersonBuilder() | PhoneNumberBuilder()
# ... etc

Fill tables in a template Word with Python (DocxTemplate, Jinja2)

I am trying to fill a table in a Word document from Python with DocxTemplate, and I am having trouble doing it properly. I want to use 2 dictionaries to fill the data in 1 table, shown in the figure below.
Table to fill
The 2 dictionaries are filled in a loop and I write the template document at the end.
The input document used to create my dictionaries is a DB extraction written in SQL.
My main issue arises when I want to fill the table with the data from the 2 different dictionaries.
In the code below I give, as an example, the 2 dictionaries with values in them.
# -*- coding: utf8 -*-
#
#
from docxtpl import DocxTemplate

if __name__ == "__main__":
    document = DocxTemplate("template.docx")
    # Field name -> newline-separated list of the tables that contain it.
    # (The second value was missing its closing quote, which made the whole
    # script a syntax error.)
    DicoOccuTable = {
        '`num_carnet_adresses`': '`annuaire_telephonique`\n`carnet_adresses`\n`carnet_adresses_complement',
        '`num_eleve`': '`CFA_apprentissage_ctrl_coherence`\n`CFA_apprentissage_ctrl_examen`',
    }
    # Field name -> number of tables that contain it.
    DicoChamp = {'`num_carnet_adresses`': 72, '`num_eleve`': 66}
    template_values = {}
    #
    # "keys" is a list of two lists: one built from DicoChamp, one from
    # DicoOccuTable.
    template_values["keys"] = [[{"name": cle, "occu": val} for cle, val in DicoChamp.items()],
                               [{"table": vals} for cles, vals in DicoOccuTable.items()]]
    #
    document.render(template_values)
    # NOTE(review): nomTable is not defined in this excerpt; it comes from
    # the parsing loop shown further down in the question.
    document.save('output/' + nomTable.replace('`', '') + '.docx')
As a result the two lines for the table are created but nothing is written within...
I would like to add that it's only been 1 week that I work on Python, so I feel that I don't manage properly the different objects here.
If you have any suggestion to help me, I would appreciate it !
I put here the loop to create the dictionnaries, it may help you to understand why I coded it wrong :)
for c in ChampList:
with open("db_reference.sql", "r") as f:
listTable = []
line = f.readlines()
for l in line:
if 'CREATE TABLE' in l:
begin = True
linecreateTable = l
x = linecreateTable.split()
nomTable = x[2]
elif c in l and begin == True:
listTable.append(nomTable)
elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in l:
begin = False
nbreOccu=len(listTable)
Tables = "\n".join(listTable)
DicoChamp.update({c:nbreOccu})
DicoOccuTable.update({c:Tables})
# DicoChamp = {c:nbreOccu}
template_values = {}
Thank You very much !
Finally I found a solution for this problem. Here it is.
Instead of using 2 dictionaries I created 1 dictionary with this structure:
Dico = { Champ : [Occu , Tables] }
The full code for creating the table is detailed below :
from docxtpl import DocxTemplate

document = DocxTemplate("template.docx")
template_values = {}
# Maps each field name (backticks removed) to [occurrence count, table names].
Context = {}

# The dump does not change between fields, so read it once instead of
# re-opening it for every entry of ChampList.
with open("db_reference.sql", "r") as g:
    ligne = g.readlines()

for c in ChampList:
    listTable = []
    nbreOccu = 0
    OccuTables = []
    # Start outside any CREATE TABLE statement; without this, a line that
    # contains the field before the first CREATE TABLE raises NameError.
    begin = False
    for li in ligne:
        if 'CREATE TABLE' in li:
            begin = True
            # The third whitespace-separated token of the line is taken as
            # the table name.
            nomTable2 = li.split()[2]
        elif c in li and begin:
            listTable.append(nomTable2)
        elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in li:
            begin = False
        elif '/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;' in li:
            # This marker line is used as the "end of dump" signal: record
            # the results gathered for the current field.  The variable in
            # the dump is @OLD_COLLATION_CONNECTION (with '@'); matching on
            # '#' never succeeds and leaves Context empty.
            nbreOccu = len(listTable)
            inter = "\n".join(listTable)
            OccuTables.append(nbreOccu)
            OccuTables.append(inter)
            ChampNumPropre = c.replace('`', '')
            Context.update({ChampNumPropre: OccuTables})

template_values["keys"] = [{"label": cle, "cols": val} for cle, val in Context.items()]
#
document.render(template_values)
# NOTE(review): the original referenced nomTable, which this script never
# assigns; nomTable2 (the last table seen) is presumably what was meant --
# confirm the intended output file name.
document.save('output/' + nomTable2.replace('`', '') + '.docx')
And I used a table with the following structure :
I hope you will find your answers here and good luck !

Parse a Generated File Python

I'm trying to parse generated files into a list of objects.
Unfortunately the structure of the generated files is not always the same, but they contain the same fields (and lots of other garbage).
For example:
function foo(); # Don't Care
function maybeanotherfoo(); # Don't Care
int maybemoregarbage; # Don't Care
product_serial = "CDE1102"; # I want this <---------------------
unnecessary_info1 = 10; # Don't Care
unnecessary_info2 = "red" # Don't Care
product_id = 1134412; # I want this <---------------------
unnecessary_info3 = "88" # Don't Care
product_serial = "DD1232"; # I want this <---------------------
product_id = 3345111; # I want this <---------------------
unnecessary_info1 = "22" # Don't Care
unnecessary_info2 = "panda" # Don't Care
product_serial = "CDE1102"; # I want this <---------------------
unnecessary_info1 = 10; # Don't Care
unnecessary_info2 = "red" # Don't Care
unnecessary_info3 = "bear" # Don't Care
unnecessary_info4 = 119 # Don't Care
product_id = 1112331; # I want this <---------------------
unnecessary_info5 = "jj" # Don't Care
I want a list of objects (each object has: serial and id).
I have tried the following:
import re
class Product:
def __init__(self, id, serial):
self.product_id = id
self.product_serial = serial
linenum = 0
first_string = "product_serial"
second_string = "product_id"
with open('products.txt', "r") as products_file:
for line in products_file:
linenum += 1
if line.find(first_string) != -1:
product_serial = re.search('\"([^"]+)', line).group(1)
#How do I proceed?
Any advice would be greatly appreciated!
Thanks!
I've inlined the data here using an io.StringIO(), but you can substitute data for your products_file.
The idea is that we gather key/values into current_object, and as soon as we have all the data we know we need for a single object (the two keys), we push it onto a list of objects and prime a new current_object.
You could use something like if line.startswith('product_serial') instead of the admittedly complex regexp.
import io
import re

data = io.StringIO("""
function foo();
function maybeanotherfoo();
int maybemoregarbage;
product_serial = "CDE1102";
unnecessary_info1 = 10;
unnecessary_info2 = "red"
product_id = 1134412;
unnecessary_info3 = "88"
product_serial = "DD1232";
product_id = 3345111;
unnecessary_info1 = "22"
unnecessary_info2 = "panda"
product_serial = "CDE1102";
unnecessary_info1 = 10;
unnecessary_info2 = "red"
unnecessary_info3 = "bear"
unnecessary_info4 = 119
product_id = 1112331;
unnecessary_info5 = "jj"
""")

# Accepts only the two wanted assignments; the value is either a run of
# digits or a double-quoted string, optionally followed by a semicolon.
FIELD_RE = re.compile(r'^(product_id|product_serial)\s*=\s*(\d+|"(?:.+?)");?$')

objects = []
current_object = {}
for raw_line in data:
    # Strip surrounding whitespace before matching.
    found = FIELD_RE.match(raw_line.strip())
    if found is None:
        continue
    field, text = found.groups()
    current_object[field] = text.strip('"')
    # Both keys collected: store the finished record and start a new one.
    if len(current_object) == 2:
        objects.append(current_object)
        current_object = {}
print(objects)

Best simple and effective solution for data organization

I'm new with Python programming so I'm doing a bunch of practice exercise in order to improve my skills.
Therefore, I would like to show you guys my approach on this example and if you could let me know what you think I would be grateful!
Exercise:
Given a list of customer IDs, I have to segment them by the following logic:
If ID is multiple of 7 and multiple of 3 then segment 'A'
If ID is multiple of 3 then segment 'B'
If ID is multiple of 7, then segment 'C'
Else, segment 'D'
What I've done:
from collections import Counter
from datetime import datetime
import os.path
import json
date = datetime.today().strftime('%Y-%m-%d')
customer_indices = [list of IDs] ex: [123981,12398,123157,12371...]
def segment(customer):
    """Return the segment letter for a customer ID.

    'A' for a multiple of both 3 and 7, 'B' for a multiple of 3 only,
    'C' for a multiple of 7 only, and 'D' otherwise -- the mapping given in
    the exercise statement.
    """
    by_three = customer % 3 == 0
    by_seven = customer % 7 == 0
    # Use the boolean ``and``: the bitwise ``&`` binds tighter than ``==``,
    # so ``x % 7 == 0 & x % 3 == 0`` collapses to ``x % 7 == 0``.
    if by_three and by_seven:
        return 'A'
    if by_three:
        return 'B'
    if by_seven:
        return 'C'
    return 'D'


def split_customers(customers):
    """Partition customers into four lists ``(a, b, c, d)`` by segment.

    Classification is delegated to ``segment`` so that the lists always agree
    with the per-customer labels; input order is preserved within each list.
    """
    buckets = {'A': [], 'B': [], 'C': [], 'D': []}
    for customer in customers:
        buckets[segment(customer)].append(customer)
    return buckets['A'], buckets['B'], buckets['C'], buckets['D']
segmentation = [segment(customer) for customer in customer_indices]
print('Segmentation list: ')
print(segmentation)
print('\n')
segmentation_counter = Counter(segmentation)
print('Count of clients per segment: ')
print(f"A: {segmentation_counter['A']}")
print(f"B: {segmentation_counter['B']}")
print(f"C: {segmentation_counter['C']}")
print(f"D: {segmentation_counter['D']}")
a, b, c, d = split_customers(customer_indices)
main_dict = {'Date': date,
'Segmentation': {
'A Clients': {
'Count': segmentation_counter['A'],
'Customers': a},
'B Clients': {
'Count': segmentation_counter['B'],
'Customers': b},
'C Clients': {
'Count': segmentation_counter['C'],
'Customers': c},
'D Clients': {
'Count': segmentation_counter['D'],
'Customers': d}}}
main_list = [main_dict]
# Append today's segmentation to the JSON history file, creating the file
# (and its directory) on the first run.
segmentation_path = 'Data/customer_segmentation.json'
# exist_ok=True: the directory may already exist even when the file does not,
# in which case a plain makedirs() raises FileExistsError.
os.makedirs('Data', exist_ok=True)
if os.path.isfile(segmentation_path):
    with open(segmentation_path) as file:
        data = json.load(file)
    data.append(main_dict)
else:
    data = main_list
# The with-block closes the file; no explicit close() is needed.
with open(segmentation_path, 'w') as file:
    json.dump(data, file, indent=2)
The original code has a with open txt function at first that extracts the client's ID list from a txt within the same directory of this .py
The main idea of this solution it would be to execute this every day so the json file will update with a new list for each day that it's run, so if I want to do an analysis of the segmentation growth, it would be pretty easy to do so.
What do you think?

Aggregating values in one column by their corresponding value in another from two files

had a question regarding summing the multiple values of duplicate keys into one key with the aggregate total. For example:
1:5
2:4
3:2
1:4
Very basic but I'm looking for an output that looks like:
1:9
2:4
3:2
In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3).
I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as:
Britney Spears (289) 2393140. Any help or input would be so appreciated.
import codecs
#from collections import defaultdict
with codecs.open("artists.dat", encoding = "utf-8") as f:
artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
users = f.readlines()
artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]
artists = {}
for a in artist_list:
artistID, name = a[0], a[1]
artists[artistID] = name
grouped_user_stats = {}
for u in user_stats_list:
userID, artistID, weight = u
grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
for artistID, weight in u:
grouped_user_stats.groupby('artistID')['weight'].sum()
print(grouped_user_stats.groupby('artistID')['weight'].sum())
#if userID not in grouped_user_stats:
#grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
#else:
#if artistID not in grouped_user_stats[userID]:
#grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
#else:
#grouped_user_stats[userID][artistID]['plays'] += 1
#print('this never happens')
#print(grouped_user_stats)
how about:
import codecs
from collections import defaultdict

# read stuff
with codecs.open("artists.dat", encoding="utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding="utf-8") as f:
    users = f.readlines()

# transform artist data in a dict with "artist id" as key and "artist name" as value
# (the first line of each file is skipped as a header)
artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
user_stats_list = [x.strip().split('\t') for x in users][1:]

# accumulate weights in a dict with artist id as key and sum of weights as value
grouped_user_stats = defaultdict(int)
for u in user_stats_list:
    # columns are: userID, artistID, weight -- group by the ARTIST (u[1]),
    # not by the user (u[0])
    grouped_user_stats[u[1]] += int(u[2])

# extra: "fancying" the data by turning the keys into "<artist name> (artist id)"
grouped_user_stats = {
    "%s (%s)" % (artist_repo.get(k, "Unknown artist"), k): v
    for k, v in grouped_user_stats.items()
}

# lastly print it; items() and the single-argument print form run on both
# Python 2 and Python 3
for k, v in grouped_user_stats.items():
    print("%s %s" % (k, v))

Categories