I have a CSV file with columns id, name, and parentid, and I want to draw a tree from that data file. Please help.
I wrote this code, but pandas is not working:
from ete3 import Tree
import numpy as np
import csv
import pandas as pd

f = open("UdemyT21.csv", "r")
csvreader = csv.reader(f)
next(f)
test = list(csvreader)
header = ['ID', 'Name', 'ParentId']
mydict = {rows[0]: rows[2] for rows in test}
for key in mydict.items():
    for value in mydict.items():
        if value == None:
            t = Tree(key)
            t.show()
        else:
            t = Tree("(value,key);")
            t.show()
You can use the ete3 library (pip install ete3 PyQt5). The tutorial on exporting the tree to a PNG can be found here.
An example of this, assuming you have the CSV loaded as an array of [parent, id] pairs (treating id as the name); a sketch of loading the CSV into that shape follows the example:
from ete3 import Tree, TreeStyle, TextFace, add_face_to_node
from collections import defaultdict

def build_tree(nodes):
    root = None
    for i in nodes:
        if i[0] == -1:
            root = i
    if not root:
        raise ValueError('no root!')
    data = defaultdict(list)
    for i in nodes:
        if i[0] != -1:
            data[i[1]] = []
            data[i[0]].append(i[1])
    return data, root[1]

def dict_tree_to_str(tree, root):
    if not tree[root]:
        return f'{root}'
    subtrees = [dict_tree_to_str(tree, st) for st in tree[root]]
    return f'({",".join(subtrees)}){root}'

tree_csv = [[-1, 1], [1, 2], [1, 3], [2, 4], [3, 5], [3, 6], [5, 7]]
tree_dict, root = build_tree(tree_csv)
t = Tree(dict_tree_to_str(tree_dict, root) + ';', format=1)

ts = TreeStyle()
ts.show_leaf_name = False

def my_layout(node):  # https://github.com/etetoolkit/ete/issues/219
    F = TextFace(node.name, tight_text=True)
    add_face_to_node(F, node, column=0, position="branch-right")

ts.layout_fn = my_layout
t.render("mytree.png", w=183, units="mm", tree_style=ts)
I have an XML file with 50 entries.
I'm supposed to extract population density and economy and print only the top 10. I extracted the data, but I'm not sure how to print only the top 10. Here's what I have so far:
import xml.etree.ElementTree as ET

tree = ET.parse("europe.xml")
stuff = tree.getroot()
lst = stuff.findall("country")
for item in lst:
    gdp = int(item.find("gdppc").text)
    pop = int(item.find("population").text)
    area = float(item.find("area").text)
    economy = [gdp*pop]
    density = float(pop/area)
    print(item.get("name"))
    print(sorted(economy))
    print(f"{density:.2f}")
Code like this may work; I wasn't able to test it because I didn't have the .xml file.
import pandas as pd
import xml.etree.ElementTree as ET

tree = ET.parse("europe.xml")
stuff = tree.getroot()
lst = stuff.findall("country")
data = []
for item in lst:
    name = item.get("name")
    gdp = int(item.find("gdppc").text)
    pop = int(item.find("population").text)
    area = float(item.find("area").text)
    economy = gdp * pop          # keep as a scalar so the column can be sorted
    density = pop / area
    data.append([name, gdp, pop, area, economy, density])
df = pd.DataFrame(data, columns=['name', 'gdp', 'pop', 'area', 'economy', 'density'])
df = df.sort_values(by=['pop'], ascending=False)  # sort_values returns a new frame
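To actually answer the "top 10" part of the question, a sketch along these lines should work on the DataFrame built above (assuming the 'name' column added there, and that "top" means largest density and largest economy):

top_density = df.nlargest(10, 'density')[['name', 'density']]
top_economy = df.nlargest(10, 'economy')[['name', 'economy']]
print(top_density.to_string(index=False))
print(top_economy.to_string(index=False))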
I'm trying to do multiprocessing with Python, but I get duplicated values in the results file. Could you please help me solve that?
Here is my code:
import itertools
from multiprocessing import Pool
from multiprocessing import Manager
import pandas as pd

PARAMS = {}
LPT_LIMIT = [4, 6, 8, 10]
HPT_LIMIT = [1.6, 1.8, 2.0]
NB_FLIGHT = [10, 30]
LPT_EXCEEDENCE = [1, 4]
HPT_EXCEEDENCE = [3, 4]

tmp = [LPT_LIMIT, HPT_LIMIT, NB_FLIGHT, LPT_EXCEEDENCE, HPT_EXCEEDENCE]
parameters = list(itertools.product(*tmp))

def toto(param):
    PARAMS['LPT_LMIT'] = param[0]
    PARAMS['HPT_LMIT'] = param[1]
    PARAMS['NB_FLIGHT'] = param[2]
    PARAMS['LPT_EXCEEDENCE'] = param[3]
    PARAMS['HPT_EXCEEDENCE'] = param[4]
    return PARAMS

if __name__ == '__main__':
    pool = Pool()
    manager = Manager()
    my_list = manager.list()
    my_list.append(pool.map(toto, parameters))
    flat_list = [item for sublist in my_list for item in sublist]
    pd.DataFrame(flat_list).to_excel('results.xlsx', index=False)
The result is that I only get the value 4 for HPT_EXCEEDENCE (please see the attached file).
I only get HPT_EXCEEDENCE = 4, but HPT_EXCEEDENCE should be either 3 or 4, so I don't know what's wrong with my code.
Unless you are doing something more complex, you don't need Manager(). The problem was specifically the location of PARAMS = {}: because it was a module-level dict, every call to toto within a worker mutated and returned the same dict object, so all of that worker's results ended up holding the last values it processed. Moving it inside the function gives each call its own dict. See the updated code below (and the small aliasing sketch after it); this seems to get the result you want.
import itertools
from multiprocessing import Pool
import pandas as pd

LPT_LIMIT = [4, 6, 8, 10]
HPT_LIMIT = [1.6, 1.8, 2.0]
NB_FLIGHT = [10, 30]
LPT_EXCEEDENCE = [1, 4]
HPT_EXCEEDENCE = [3, 4]

tmp = [LPT_LIMIT, HPT_LIMIT, NB_FLIGHT, LPT_EXCEEDENCE, HPT_EXCEEDENCE]
parameters = list(itertools.product(*tmp))

def toto(param):
    PARAMS = {}  # a fresh dict per call, so results no longer alias each other
    PARAMS['LPT_LMIT'] = param[0]
    PARAMS['HPT_LMIT'] = param[1]
    PARAMS['NB_FLIGHT'] = param[2]
    PARAMS['LPT_EXCEEDENCE'] = param[3]
    PARAMS['HPT_EXCEEDENCE'] = param[4]
    return PARAMS

if __name__ == '__main__':
    pool = Pool()
    my_list = pool.map(toto, parameters)
    pd.DataFrame(my_list).to_excel('results1.xlsx', index=False)
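For completeness, a tiny sketch (independent of multiprocessing) of why the original version produced duplicates: mutating and returning a module-level dict means every result is a reference to the same object.

shared = {}

def broken(v):
    shared['x'] = v
    return shared          # always the same dict object

print([broken(v) for v in (1, 2, 3)])  # [{'x': 3}, {'x': 3}, {'x': 3}]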
I'm trying to load a List[np.ndarray] into shared memory so that other processes can access that shared memory directly and recover the original List[np.ndarray] without copying it into every process. The detailed motivation is in my previous question: share read-only generic complex python object with int, list of numpy array, tuple, etc. as instance field between multiprocessing
I wrote the following code (Python 3.8.12, NumPy 1.20.3, macOS):
encode_nd_arr_list(): given a List[np.ndarray], it returns a list of shared-memory names.
decode_nd_arr_list(): given a list of shared-memory names, it recovers the original List[np.ndarray].
from typing import List, Tuple
import numpy as np
from multiprocessing.shared_memory import SharedMemory
from multiprocessing.managers import SharedMemoryManager

def encode_nd_arr_list(
        smm: SharedMemoryManager,
        nd_arr_list: List[np.ndarray]
):
    shm_name_list = []
    shape, dtype = nd_arr_list[0].shape, nd_arr_list[0].dtype
    print(shape)
    print(dtype)
    for nd_arr in nd_arr_list:
        shm = smm.SharedMemory(size=nd_arr.nbytes)
        shm_arr = np.ndarray(shape=shape, dtype=dtype, buffer=shm.buf)
        np.copyto(shm_arr, nd_arr)
        shm_name_list.append(shm.name)
    return shm_name_list, shape, dtype

def decode_nd_arr_list(
        shm_name_list: List[str],
        shape: Tuple[int],
        dtype: np.dtype
):
    nd_array_list = []
    for shm_name in shm_name_list:
        print("----------")
        shm = SharedMemory(shm_name)
        nd_arr = np.ndarray(shape=shape, dtype=dtype, buffer=shm.buf)
        print("nd_arr:", nd_arr)
        nd_array_list.append(nd_arr)
    print("nd_array_list:", nd_array_list)
    return nd_array_list

if __name__ == '__main__':
    arr = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
    nd_arr_list = [arr, arr + 1, arr + 2]
    print(nd_arr_list)
    with SharedMemoryManager() as smm:
        shm_name_list, shape, dtype = encode_nd_arr_list(smm, nd_arr_list)
        print(shm_name_list)
        print(shape)
        print(dtype)
        res = decode_nd_arr_list(shm_name_list, shape, dtype)
        print("------------")
        print(res)
However, when I run it in PyCharm, the console shows Process finished with exit code 139 (interrupted by signal 11: SIGSEGV). When I run it in the terminal, it shows segmentation fault, without any error information.
My Question:
What does this fault mean in my case?
How can I make my code work? Thanks.
The buffers used in each iteration of the loop in the decode_nd_arr_list function get closed when the corresponding SharedMemory object goes out of scope, and that causes the segfault. You are essentially trying to access memory that is no longer valid.
In order to fix it, you can create a custom object that wraps around the ndarray and also stores the SharedMemory to prevent it from going out of scope.
Example:
from typing import List, Tuple
import numpy as np
from multiprocessing.shared_memory import SharedMemory
from multiprocessing.managers import SharedMemoryManager

class SHMArray(np.ndarray):
    def __new__(cls, input_array, shm=None):
        obj = np.asarray(input_array).view(cls)
        obj.shm = shm
        return obj

    def __array_finalize__(self, obj):
        if obj is None:
            return
        self.shm = getattr(obj, 'shm', None)

def encode_nd_arr_list(
        smm: SharedMemoryManager,
        nd_arr_list: List[np.ndarray]
):
    shm_name_list = []
    shape, dtype = nd_arr_list[0].shape, nd_arr_list[0].dtype
    print(shape)
    print(dtype)
    for nd_arr in nd_arr_list:
        shm = smm.SharedMemory(size=nd_arr.nbytes)
        shm_arr = np.ndarray(shape=shape, dtype=dtype, buffer=shm.buf)
        np.copyto(shm_arr, nd_arr)
        shm_name_list.append(shm.name)
    return shm_name_list, shape, dtype

def decode_nd_arr_list(
        shm_name_list: List[str],
        shape: Tuple[int],
        dtype: np.dtype
):
    nd_array_list = []
    for shm_name in shm_name_list:
        print("----------")
        shm = SharedMemory(shm_name)
        # Wrap the view in SHMArray so it keeps a reference to `shm`,
        # preventing the buffer from being closed while the array is alive.
        nd_arr = SHMArray(np.ndarray(shape=shape, dtype=dtype, buffer=shm.buf), shm)
        print("nd_arr:", nd_arr)
        nd_array_list.append(nd_arr)
    print("nd_array_list:", nd_array_list)
    return nd_array_list

if __name__ == '__main__':
    arr = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
    nd_arr_list = [arr, arr + 1, arr + 2]
    print(nd_arr_list)
    with SharedMemoryManager() as smm:
        shm_name_list, shape, dtype = encode_nd_arr_list(smm, nd_arr_list)
        print(shm_name_list)
        print(shape)
        print(dtype)
        res = decode_nd_arr_list(shm_name_list, shape, dtype)
        print("------------")
        print(res)
Reference: https://github.com/numpy/numpy/issues/18294#issuecomment-771329575
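As a usage note (not part of the original answer): the shared-memory names, shape and dtype are small and picklable, so they can be handed to another process, which reattaches to the segments instead of copying the arrays. A minimal sketch, assuming the definitions above and that the process is started inside the SharedMemoryManager block:

from multiprocessing import Process

def worker(shm_name_list, shape, dtype):
    arrays = decode_nd_arr_list(shm_name_list, shape, dtype)
    print("sum computed in worker:", sum(a.sum() for a in arrays))

# p = Process(target=worker, args=(shm_name_list, shape, dtype))
# p.start()
# p.join()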
I'm trying to apply a json-patch to a Mongoengine Document.
I'm using this json-patch library: https://github.com/stefankoegl/python-json-patch, with mongoengine 0.14.3 and Python 3.6.3.
This is my current code:
json_patch = JsonPatch.from_string(jp_string)
document = Document.objects(id=document_id)
json_documents = json.loads(document.as_pymongo().to_json())
json_patched_document = json_patch.apply(json_documents[0])
Document.objects(id=document_id).first().delete()
Document \
    .from_json(json.dumps(json_patched_document)) \
    .save(force_insert=True)
Is there a better way to save an edited JSON document?
I've enhanced the code a little bit:
json_patch = JsonPatch.from_string(jp_string)
document = Document.objects(id=document_id)
json_document = json.loads(document.as_pymongo().to_json())
json_patched_document = json_patch.apply(json_document[0])
Document \
    .from_json(json.dumps(json_patched_document), created=True) \
    .save()
But is there a way to avoid converting the document to JSON?
I had a slightly similar problem: I didn't want to save the complete Document, I just wanted to update the fields that were modified or added.
Here's the code I tested on the inputs below:
import jsonpatch

def tryjsonpatch():
    doc_in_db = {'foo': 'bar', "name": "aj", 'numbers': [1, 3, 7, 8]}
    input = {'foo': 'bar', "name": "dj", 'numbers': [1, 3, 4, 8]}
    input2 = {'foo': 'bar', "name": "aj", 'numbers': [1, 3, 7, 8], "extera": "12"}
    input3 = {'foo': 'bar', "name": "dj", 'numbers': [1, 3, 4, 8], "extera": "12"}

    patch = jsonpatch.JsonPatch.from_diff(doc_in_db, input3)
    print("\n***patch***\n", patch)
    doc = get_minimal_doc(doc_in_db, patch)
    result = patch.apply(doc, in_place=True)
    print("\n###result###\n", result,
          "\n###present###\n", doc_in_db)

def get_minimal_doc(present, patch):
    cur_dc = {}
    for change in patch.patch:
        if change['op'] not in ("add"):
            keys = change['path'].split("/")[1:]
            present_move = {}
            old_key = 1
            first = True
            for key in keys:
                if key.isdigit():  # old_key represented an array
                    cur_dc[old_key] = present_move
                else:
                    if first:
                        cur_dc[key] = {}
                        first = False
                    else:
                        cur_dc[old_key][key] = {}
                    old_key = key
                    present_move = present[old_key]
    return cur_dc

tryjsonpatch()
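Regarding "is there a way to not convert the document to JSON?": one option is to translate simple patch operations into MongoEngine's atomic update keywords instead of re-saving the whole document. This is only a sketch, under the assumption that the patch contains flat add/replace paths (nested paths would need their segments joined with '__'):

def patch_to_update_kwargs(patch):
    kwargs = {}
    for op in patch.patch:
        if op['op'] in ('add', 'replace'):
            field = op['path'].lstrip('/').replace('/', '__')
            kwargs['set__' + field] = op['value']
    return kwargs

# Document.objects(id=document_id).update(**patch_to_update_kwargs(json_patch))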
How can I do channel attribution (Markov chain model) in Python, like the 'ChannelAttribution' package in R?
I wrote Python code for a multi-channel attribution model: https://www.linkedin.com/pulse/multi-channel-attribution-model-python-sheranga-gamwasam/
Here is the dataset link: https://docs.google.com/spreadsheets/d/11pa-eQDHEX63uSEA4eWiDTOZ7lbO6Vwt-dHkuhuhbSo/edit?usp=sharing
import time
import pandas as pd
import numpy as np
import collections
from itertools import chain
import itertools
from scipy import stats
import statistics
def unique(list1):
    unique_list = []
    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

def split_fun(path):
    return path.split('>')

def calculate_rank(vector):
    a = {}
    rank = 0
    for num in sorted(vector):
        if num not in a:
            a[num] = rank
            rank = rank + 1
    return [a[i] for i in vector]
def transition_matrix_func(import_data):
    z_import_data = import_data.copy()
    z_import_data['path1'] = 'start>' + z_import_data['path']
    z_import_data['path2'] = z_import_data['path1'] + '>convert'
    z_import_data['pair'] = z_import_data['path2'].apply(split_fun)
    zlist = z_import_data['pair'].tolist()
    zlist = list(chain.from_iterable(zlist))
    zlist = list(map(str.strip, zlist))
    T = calculate_rank(zlist)

    M = [[0] * len(unique(zlist)) for _ in range(len(unique(zlist)))]
    for (i, j) in zip(T, T[1:]):
        M[i][j] += 1

    x_df = pd.DataFrame(M)
    np.fill_diagonal(x_df.values, 0)
    x_df = pd.DataFrame(x_df.values / x_df.values.sum(axis=1)[:, None])
    x_df.columns = sorted(unique(zlist))
    x_df['index'] = sorted(unique(zlist))
    x_df.set_index("index", inplace=True)
    x_df.loc['convert', :] = 0
    return x_df
def simulation(trans, n):
    sim = [''] * n
    sim[0] = 'start'
    i = 1
    while i < n:
        sim[i] = np.random.choice(trans.columns, 1, p=trans.loc[sim[i-1], :])[0]
        if sim[i] == 'convert':
            break
        i = i + 1
    return sim[0:i+1]
def markov_chain(data_set, no_iteration=10, no_of_simulation=10000, alpha=5):
    import_dataset_v1 = data_set.copy()
    import_dataset_v1 = (import_dataset_v1.reindex(import_dataset_v1.index.repeat(import_dataset_v1.conversions))).reset_index()
    import_dataset_v1['conversions'] = 1
    import_dataset_v1 = import_dataset_v1[['path', 'conversions']]
    import_dataset = (import_dataset_v1.groupby(['path']).sum()).reset_index()
    import_dataset['probability'] = import_dataset['conversions'] / import_dataset['conversions'].sum()
    final = pd.DataFrame()

    for k in range(0, no_iteration):
        start = time.time()
        import_data = pd.DataFrame({'path': np.random.choice(import_dataset['path'], size=import_dataset['conversions'].sum(), p=import_dataset['probability'], replace=True)})
        import_data['conversions'] = 1
        tr_matrix = transition_matrix_func(import_data)
        channel_only = list(filter(lambda k0: k0 not in ['start', 'convert'], tr_matrix.columns))

        ga_ex = pd.DataFrame()
        tr_mat = tr_matrix.copy()
        p = []
        i = 0
        while i < no_of_simulation:
            p.append(unique(simulation(tr_mat, 1000)))
            i = i + 1

        path = list(itertools.chain.from_iterable(p))
        counter = collections.Counter(path)
        df = pd.DataFrame({'path': list(counter.keys()), 'count': list(counter.values())})
        df = df[['path', 'count']]
        ga_ex = ga_ex.append(df, ignore_index=True)

        df1 = (pd.DataFrame(ga_ex.groupby(['path'])[['count']].sum())).reset_index()
        df1['removal_effects'] = df1['count'] / len(path)
        # df1['removal_effects'] = df1['count'] / sum(df1['count'][df1['path'] == 'convert'])
        df1 = df1[df1['path'].isin(channel_only)]
        df1['ass_conversion'] = df1['removal_effects'] / sum(df1['removal_effects'])
        df1['ass_conversion'] = df1['ass_conversion'] * sum(import_dataset['conversions'])
        final = final.append(df1, ignore_index=True)

        end = time.time()
        t1 = (end - start)
        print(t1)

    '''
    H0: u=0
    H1: u>0
    '''
    unique_channel = unique(final['path'])
    # final = (pd.DataFrame(final.groupby(['path'])[['ass_conversion']].mean())).reset_index()
    final_df = pd.DataFrame()
    for i in range(0, len(unique_channel)):
        x = (final['ass_conversion'][final['path'] == unique_channel[i]]).values
        final_df.loc[i, 0] = unique_channel[i]
        final_df.loc[i, 1] = x.mean()
        v = stats.ttest_1samp(x, 0)
        final_df.loc[i, 2] = v[1] / 2
        if v[1] / 2 <= alpha / 100:
            final_df.loc[i, 3] = str(100 - alpha) + '% statistically confidence'
        else:
            final_df.loc[i, 3] = str(100 - alpha) + '% statistically not confidence'
        final_df.loc[i, 4] = len(x)
        final_df.loc[i, 5] = statistics.stdev(x)
        final_df.loc[i, 6] = v[0]

    final_df.columns = ['channel', 'ass_conversion', 'p_value', 'confidence_status', 'frequency', 'standard_deviation', 't_statistics']
    final_df['ass_conversion'] = sum(import_dataset['conversions']) * final_df['ass_conversion'] / sum(final_df['ass_conversion'])
    return final_df, final

import_dataset = pd.read_csv('channel attribution example.csv')
data, dataset = markov_chain(import_dataset, no_iteration=10, no_of_simulation=10000, alpha=5)
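In case the linked spreadsheet is unavailable, a minimal sketch of the expected input shape (column names inferred from the code above; the paths and counts are made up): a 'path' column of '>'-separated channel names and an integer 'conversions' column.

sample = pd.DataFrame({
    'path': ['facebook>email', 'email', 'facebook>search>email', 'search>email'],
    'conversions': [2, 5, 1, 3],
})
# data, dataset = markov_chain(sample, no_iteration=10, no_of_simulation=10000, alpha=5)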