I have 10 CSV files like this :
I want to add 10 columns to my dataframe with a VWAP calculation. I tried to create the columns and then concatenate them into the dataframe, but it doesn't work at all. I have tried a lot of things; the main problem is that I can't create new columns from calculated rows:
import pandas as pd
import os
import glob
from IPython.display import display, HTML
import csv
# use glob to get all the csv files
# in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join("*.csv"))
"""
#To change the name of every columns
liste1 = []
header_list = []
for f in csv_files:
    liste1.append(f)
header_list = [a.strip(".csv") for a in liste1]
"""
def add(f):
    df = pd.read_csv(f, header=0)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
    price = df["price"]
    amount = df["amount"]
    return df.assign(vwap = (price * amount).cumsum() / amount.cumsum())

for f in csv_files:
    df = pd.read_csv(f, header=0)
    df2 = pd.concat(add(f))
    df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
Thanks for your help
The traceback:
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_16732/557098648.py in <module>
31 for f in csv_files:
32 df = pd.read_csv(f, header=0)
---> 33 df2 = pd.concat(add(f))
34 df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
35
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
292 ValueError: Indexes have overlapping values: ['a']
293 """
--> 294 op = _Concatenator(
295 objs,
296 axis=axis,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327 ):
328 if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
--> 329 raise TypeError(
330 "first argument must be an iterable of pandas "
331 f'objects, you passed an object of type "{type(objs).__name__}"'
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
If you need only the aggregated values in the output:
def add(df):
    #Removed read_csv
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
    price = df["price"]
    amount = df["amount"]
    return (price * amount).cumsum() / amount.cumsum()

out = []
for f in csv_files:
    df = pd.read_csv(f, header=0)
    #append the vwap Series computed for this file to the list
    out.append(add(df))
#joined all dfs together
df2 = pd.concat(out, ignore_index=True, axis=1)
#removed append mode
df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv",
           encoding='utf-8')
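If instead you want one named VWAP column per file (the 10 columns mentioned in the question), a minimal sketch along the same lines could name each Series after its file before concatenating; the output file name here is only a placeholder:

import os
import glob
import pandas as pd

csv_files = glob.glob("*.csv")

def hourly_vwap(f):
    df = pd.read_csv(f, header=0)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    # hourly means per file; the timestamp stays as the index so the files align on it
    hourly = df.groupby(pd.Grouper(key="timestamp", freq="h")).mean(numeric_only=True)
    name = os.path.splitext(os.path.basename(f))[0]  # column named after the file
    return ((hourly["price"] * hourly["amount"]).cumsum()
            / hourly["amount"].cumsum()).rename(name)

# one VWAP column per file, aligned on the hourly timestamps
result = pd.concat([hourly_vwap(f) for f in csv_files], axis=1)
result.to_csv("resultat_vwap.csv", encoding="utf-8")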
So I'm new to programming and machine learning, and I'm using this code I found in a journal for spam detection. When I try to use it, the result turns out to be an error, even though I already prepared the data correctly. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.'
Can anyone please help me out with this issue?
[The link for the complete code is here](https://github.com/ijdutse/spd)
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft
data_sources = ['I:\Data Penelitian\Iphone/iphone.json']
def main():
    spd = Spd(data_sources) #class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    return relevant_tweets

class Spd:
    """ some functions to accept raw files, extract relevant fields and filter our irrelevent content"""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        pass

    # first function in the class:
    def extractor(self, data_sources): # accept list of files consisting of raw tweets in form of json object
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],\
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],\
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],\
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open('I:\Data Penelitian\Iphone/iphone.json', 'r') as f: # data_source is read from extractor() function
            for line in f.readlines():
                non_English = 0
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])
                        data_extracts['ScreenName'].append(line['ScreenName'])
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets +=1
                except:
                    continue
        df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
        df0['CreatedAt']=pd.to_datetime(data_extracts['CreatedAt'],errors='coerce') # convert to datetime
        df0['AccountCreated']=pd.to_datetime(data_extracts['AccountCreated'],errors='coerce')
        df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
        AccountAge = [] # compute the account age of accounts
        date_format = "%Y-%m-%d %H:%M:%S"
        for dr,dc in zip(df0.CreatedAt, df0.AccountCreated):
            #try:
            dr = str(dr)
            dc = str(dc)
            d1 = datetime.strptime(dr,date_format)
            d2 = datetime.strptime(dc,date_format)
            dif = d1 - d2
            AccountAge.append(dif.days)
            #except:
            #continue
        df0['AccountAge']=AccountAge
        # add/define additional features ...
        df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0]=='RT' )
        df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
        df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
        df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
        df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
        df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
        df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
        df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
        df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
        df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
        df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
        df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
        df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
        df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
        df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
        # Remove all RTs, set UserID as index and save relevant files:
        df0 = df0[df0.Retweets.values==False] # remove retweets
        df0 = df0.set_index('UserID')
        df0 = df0[~df0.index.duplicated()] # remove duplicates in the tweet
        #df0.to_csv(data_source[:15]+'all_extracts.csv') #save all extracts as csv
        df0.to_csv(data_sources[:5]+'all_extracts.csv') #save all extracts as csv
        with open(data_sources[:5]+'non_English.txt','w') as d: # save count of non-English tweets
            d.write('{}'.format(non_english_tweets))
            d.close()
        return df0

    def detector(self, data_sources): # accept list of raw tweets as json objects
        self.data_sources = data_sources
        for data_sources in data_sources:
            self.data_sources = data_sources
            df0 = self.extractor(data_sources)
            #drop fields not required for predicition
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',\
                          'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinity values to avoid 0 division ...
            X = X.dropna()
            # reload the trained model for use:
            spd_filter=pickle.load(open('trained_rf.pkl','rb'))
            PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0.loc[X.PredictedClass.values==0] # sort out spam/automated accounts
            #relevant_tweets = nonspam[['CreatedAt', 'CleanTweets']]
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            #relevant_tweets = relevant_tweets.drop('UserID', axis=1)
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') #save all extracts as csv, used to be 15
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
        return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ =='__main__':
    main()
The traceback error is as follows:
ValueError Traceback (most recent call last)
<ipython-input-2-5dc56f49d005> in <module>
142
143 if __name__ =='__main__':
--> 144 main()
<ipython-input-2-5dc56f49d005> in main()
18 spd = Spd(data_sources) #class instantiation
19 start = time.process_time()
---> 20 relevant_tweets = spd.detector(data_sources)
21 stop = time.process_time()
22 return relevant_tweets
<ipython-input-2-5dc56f49d005> in detector(self, data_sources)
126 # reload the trained model for use:
127 spd_filter=pickle.load(open('trained_rf.pkl','rb'))
--> 128 PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
129 X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
130 nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict(self, X)
543 The predicted classes.
544 """
--> 545 proba = self.predict_proba(X)
546
547 if self.n_outputs_ == 1:
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict_proba(self, X)
586 check_is_fitted(self, 'estimators_')
587 # Check data
--> 588 X = self._validate_X_predict(X)
589
590 # Assign chunk of trees to jobs
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 #property
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
548 " minimum of %d is required%s."
549 % (n_samples, array.shape, ensure_min_samples,
--> 550 context))
551
552 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.
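For what it's worth, the shape (0, 19) in the last line means the DataFrame handed to predict() has no rows left. In this script that typically happens when the extractor appends nothing (the bare except swallows KeyErrors from lookups such as line['Language']) or when dropna() removes every remaining row. A small self-contained sketch, using a throwaway RandomForestClassifier rather than the trained_rf.pkl model, reproduces the situation and shows the kind of guard that makes the failure obvious:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# stand-in model with the same number of features as in the error (19)
clf = RandomForestClassifier().fit(np.random.rand(10, 19), np.random.randint(0, 2, 10))

# a frame whose rows all contain NaN ends up empty after dropna(), i.e. shape (0, 19)
X = pd.DataFrame(np.full((3, 19), np.nan)).dropna()
print(X.shape)

if X.empty:
    print("nothing to predict on -- check what extractor() returned and what dropna() removed")
else:
    clf.predict(X)  # with zero rows this is the call that raises the ValueError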
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
m
This returns the following data
price 99614.04
dtype: float64
Now, when I try to use the variable 'm', I receive the following error:
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_62032/1757287628.py in <module>
5 df["date"] = pd.to_datetime(df["date"])
6 m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
----> 7 df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
~\Anaconda3\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
67 other = item_from_zerodim(other)
68
---> 69 return method(self, other)
70
71 return new_method
~\Anaconda3\lib\site-packages\pandas\core\arraylike.py in __eq__(self, other)
30 #unpack_zerodim_and_defer("__eq__")
31 def __eq__(self, other):
---> 32 return self._cmp_method(other, operator.eq)
33
34 #unpack_zerodim_and_defer("__ne__")
~\Anaconda3\lib\site-packages\pandas\core\series.py in _cmp_method(self, other, op)
5494
5495 if isinstance(other, Series) and not self._indexed_same(other):
-> 5496 raise ValueError("Can only compare identically-labeled Series objects")
5497
5498 lvalues = self._values
ValueError: Can only compare identically-labeled Series objects
But when I use the actual value for 'm' it works.
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
df.loc[(df['market'] == "NASDAQ") & (df["price"] == 99614.04)]
id name price symbol industry market currency date
25 1abf2ffc-3396-4ed9-954d-956be97668c0 Brocade Communications Systems, Inc. 99614.04 BRCD Computer Communications Equipment NASDAQ PLN 2020-09-12
Could someone please explain why this interaction is playing out this way?
The return value is a Series; you can use
m = df.loc[df['market'] == "NASDAQ", 'price'].max(numeric_only=True)
# or
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True).item()
Use the following instead; currently you're returning a Series because you're not specifying from which column you want to take the max.
m = df[df['market'].eq("NASDAQ")]['price'].max()
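To see why the original comparison fails: df.loc[df['market'] == "NASDAQ"].max(numeric_only=True) returns a Series indexed by column name, so df["price"] == m compares two Series with different labels. A small sketch with made-up data (not the real stocks.csv) shows the difference between the Series and the scalar:

import pandas as pd

df = pd.DataFrame({"market": ["NASDAQ", "NASDAQ", "NYSE"],
                   "price": [10.0, 99614.04, 55.0]})

m_series = df.loc[df["market"] == "NASDAQ"].max(numeric_only=True)
print(type(m_series))   # pandas Series, with index ['price']

m_scalar = df.loc[df["market"] == "NASDAQ", "price"].max()
print(type(m_scalar))   # a plain number

# df["price"] == m_series would raise "Can only compare identically-labeled Series objects"
print(df.loc[(df["market"] == "NASDAQ") & (df["price"] == m_scalar)])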
I have a FASTA file that looks like this
>Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|Original|hCoV-19^^Hubei|Human|Wuhan Jinyintan Hospital|Wuhan Institute of Virology|Shi|China
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*
>Spike|hCoV-19/Philippines/PH-PGC-03696/2020|2020-12-23|EPI_ISL_2155626|Original|hCoV-19^^Central Luzon|Human|Research Institute for Tropical Medicine|Philippine Genome Center|Tablizo|Philippines
MFVFLVLLPLVFSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYYPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*
>Spike|hCoV-19/Belgium/UZA-UA-8350/2021|2021-01-22|EPI_ISL_940774|Original|hCoV-19^^Berchem|Human|Platform BIS UZA/UAntwerpen|UAntwerp|Xavier|Belgium
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNTVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAQHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCT*
I need to sort these sequences based on the date column. I found this code on Stack Overflow, but it doesn't do the job for me:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd
with open('F:/newone.fasta') as fasta_file:
    identifiers = []
    lengths = []
    seq = []
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split(None, 3)[0])
        lengths.append(len(sequence))
        seq.append(sequence)

#converting lists to pandas Series
s1 = pd.Series(identifiers, name='ID')
s2 = pd.Series(lengths, name='length')
s3 = pd.Series(seq, name='seq')
Qfasta = pd.DataFrame(dict(ID=s1, length=s2)).set_index(['ID'])
This is the error that happens with the second piece of code I tried, and I don't know why it happens:
IndexError Traceback (most recent call last)
in <module>
12 SeqIO.write(records, output_file, "fasta")
13
---> 14 sort_fasta(input_file, output_file)
in sort_fasta(input_file, output_file)
8 def get_data(id_name):
9 return (id_name.split("|")[2], seguid(id_name))
---> 10 dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_data)
11 records = (dict_fasta[i] for i in sorted(list(dict_fasta), reverse=True, key = lambda d: list(map(int, d[0].split('-')))))
12 SeqIO.write(records, output_file, "fasta")
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\SeqIO\__init__.py in index(filename, format, alphabet, key_function)
873 key_function,
874 )
--> 875 return _IndexedSeqFileDict(
876 proxy_class(filename, format), key_function, repr, "SeqRecord"
877 )
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\File.py in __init__(self, random_access_proxy, key_function, repr, obj_repr)
185 offset_iter = random_access_proxy
186 offsets = {}
--> 187 for key, offset, length in offset_iter:
188 # Note - we don't store the length because I want to minimise the
189 # memory requirements. With the SQLite backend the length is kept
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\File.py in <genexpr>(.0)
181 self._obj_repr = obj_repr
182 if key_function:
--> 183 offset_iter = ((key_function(k), o, l) for (k, o, l) in random_access_proxy)
184 else:
185 offset_iter = random_access_proxy
in get_data(id_name)
7 def sort_fasta(input_file, output_file):
8 def get_data(id_name):
----> 9 return (id_name.split("|")[2], seguid(id_name))
10 dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_data)
11 records = (dict_fasta[i] for i in sorted(list(dict_fasta), reverse=True, key = lambda d: list(map(int, d[0].split('-')))))
What should I do about this?
With the following code, the FASTA entries in the input file are sorted and saved to the output file using the SeqIO index function, so it should also work with files too big to fit in memory.
import re
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
input_file = "fasta.fasta"
output_file = "out.fasta"
def sort_fasta(input_file: str, output_file: str) -> None:
    def get_index_key(id_name: str) -> tuple:
        try:
            key = (re.search(r'\d{4}-\d{2}-\d{2}', id_name).group(), seguid(id_name))
        except AttributeError:
            key = ('0001-01-01', seguid(id_name))
        return key
    dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_index_key)
    sorted_keys_by_date = sorted(list(dict_fasta), reverse=True, key = lambda d: list(map(int, d[0].split('-'))))
    records = (dict_fasta[i] for i in sorted_keys_by_date if i[0] != '0001-01-01')
    SeqIO.write(records, output_file, "fasta")
sort_fasta(input_file, output_file)
You can split the string on \n>, and sort on the extracted date using a combination of sorted and re.search to set the date as key.
Use reverse=True as option for sorted to get the most recent date first.
I am assuming the string fasta as input here.
import re
sorted_fasta = ('>'+'\n>'.join(sorted(fasta[1:].strip().split('\n>'),
                                      key=lambda s: re.search(r'\|\d{4}-\d{2}-\d{2}\|',
                                                              s).group()
                                      )
                               )
                )
example input:
>xxx|2020-12-30|xxx
NNN
>yyy|2020-12-23|yyy
NNN
>zzz|2021-01-22|zzz
NNN
matching output:
>yyy|2020-12-23|yyy
NNN
>xxx|2020-12-30|xxx
NNN
>zzz|2021-01-22|zzz
NNN
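If you would rather stay with the pandas/SimpleFastaParser approach from the question, a minimal sketch along these lines also works; it assumes the collection date is always the third |-separated field of the header and uses a made-up output path. Unlike the SeqIO.index answer above, it keeps all sequences in memory.

import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

records = []
with open('F:/newone.fasta') as fasta_file:
    for title, sequence in SimpleFastaParser(fasta_file):
        # third |-separated field of the header holds the collection date
        records.append({"title": title, "date": title.split("|")[2], "seq": sequence})

df = pd.DataFrame(records)
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).sort_values("date")

# hypothetical output path
with open('F:/newone_sorted.fasta', 'w') as out:
    for _, row in df.iterrows():
        out.write(f">{row['title']}\n{row['seq']}\n")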
I am trying to get data from Google Trends into a Google Sheet. The first time it ran smoothly; the second time, not so much. I got the following error:
ValueError: No objects to concatenate
I searched this error on Stack Overflow before but couldn't find any solutions. I use the code displayed below:
!pip install Pytrends
!pip install pandas
!pip install pytrends --upgrade <---------Note: this solved a different error.
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='nl-NL', tz=360)
df = wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1QE1QilM-GDdQle6eVunepqG5RNWv39xO0By84C19Ehc/edit?usp=sharing')
sheet = wb.sheet1
df2 = sheet.col_values(5)
d_from = sheet.acell('B7').value
d_to = sheet.acell('B8').value
geo1 = sheet.acell('B10').value
dataset = []
for x in range(1,len(df2)):
    keywords = [df2[x]]
    pytrend.build_payload(
        kw_list=keywords,
        cat=0,
        timeframe= str(d_from + " " + d_to),
        geo= str(geo1))
    data = pytrend.interest_over_time()
    if not data.empty:
        data = data.drop(labels=['isPartial'],axis='columns')
        dataset.append(data)
result = pd.concat(dataset, axis=1)
result.to_csv('search_trends_DOWNLOAD_ME.csv')
!cp search_trends_DOWNLOAD_ME.csv "/content/drive/My Drive/Colab Notebooks/Output"
executionTime = (time.time() - startTime)
print('Execution time in sec.: ' + str(executionTime))
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-5-b86c7b4df727> in <module>()
25 data = data.drop(labels=['isPartial'],axis='columns')
26 dataset.append(data)
---> 27 result = pd.concat(dataset, axis=1)
28 result.to_csv('search_trends_DOWNLOAD_ME.csv')
29 get_ipython().system('cp search_trends_DOWNLOAD_ME.csv "/content/drive/My Drive/Colab Notebooks/Output"')
1 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/reshape/concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327
328 if len(objs) == 0:
--> 329 raise ValueError("No objects to concatenate")
330
331 if keys is None:
ValueError: No objects to concatenate
The keywords I use are located in df = wb = gc.open_by_url. It is a Google Sheet with the location, the language and the keywords.
This happened to me earlier; it was just a mistyped path/URL of the file. Check the path again.
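Another thing worth checking: the traceback shows that dataset is still empty when pd.concat runs, which means interest_over_time() returned an empty frame for every keyword (empty cells in column E, or a timeframe/geo value from B7/B8/B10 that Google Trends does not accept). A sketch of the end of the loop, reusing pytrend, df2, d_from, d_to and geo1 from the question and only adding guards, could look like this:

dataset = []
for x in range(1, len(df2)):
    keywords = [df2[x]]
    pytrend.build_payload(kw_list=keywords, cat=0,
                          timeframe=str(d_from + " " + d_to), geo=str(geo1))
    data = pytrend.interest_over_time()
    if data.empty:
        print("no data returned for", keywords, "- skipping")
        continue
    dataset.append(data.drop(labels=['isPartial'], axis='columns'))

if dataset:
    result = pd.concat(dataset, axis=1)
    result.to_csv('search_trends_DOWNLOAD_ME.csv')
else:
    print("every keyword came back empty - check column E and cells B7/B8/B10 in the sheet")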
I am using this function to pull data from the Cryptocompare website into a pandas dataframe:
def daily_price_historical(symbol, comparison_symbol='USD', limit=1, aggregate=1, exchange='', allData='true'):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}&allData={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate, allData)
    if exchange:
        url += '&e={}'.format(exchange)
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp', inplace=True)
    df['symbol'] = symbol
    df['1dret'] = 100* df['close'].pct_change()
    return df
This works fine for most symbols I pass in, but when I loop over a longer list of symbols I get the error: AttributeError: 'DataFrame' object has no attribute 'time'
I assume this is due to the API returning an error for certain symbols, e.g.:
https://min-api.cryptocompare.com/data/histoday?fsym=FAKE&tsym=USD
returns "Response":"Error" with no further data
I'm afraid I'm not very experienced with url requests/APIs. Is there code I can add to the function to skip the symbols that are causing the issue?
Thanks for your help!
Additional information:
Code used to loop over coins (which is a list of 130 symbols):
price_columns = ['close', 'high', 'low', 'open', 'time',
                 'volumefrom','volumeto', 'symbol', '1dret']
top_coin_prices = pd.DataFrame(columns=price_columns)
for coin in coins:
    output = daily_price_historical(coin)
    top_coin_prices = top_coin_prices.append(output)
Full Traceback:
AttributeError Traceback (most recent call last)
<ipython-input-277-126f5d1686b2> in <module>()
8 # populate df with data for all coins
9 for coin in coins:
---> 10 output = daily_price_historical(coin)
11 top_coin_prices = top_coin_prices.append(output)
12
<ipython-input-111-65b3fa76b4ab> in daily_price_historical(symbol, comparison_symbol, limit, aggregate, exchange, allData)
7 data = page.json()['Data']
8 df = pd.DataFrame(data)
----> 9 df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
10 df.set_index('timestamp', inplace=True)
11 df['symbol'] = symbol
/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
2968 if name in self._info_axis:
2969 return self[name]
-> 2970 return object.__getattribute__(self, name)
2971
2972 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'time'
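One way to skip the symbols the API rejects, based on the "Response":"Error" behaviour noted in the question, is to inspect the JSON before building the DataFrame and return None for those symbols. This is only a sketch under that assumption; the calling loop then keeps the frames that came back and concatenates them once (which also avoids the deprecated DataFrame.append):

import datetime
import requests
import pandas as pd

def daily_price_historical(symbol, comparison_symbol='USD', limit=1, aggregate=1,
                           exchange='', allData='true'):
    url = ('https://min-api.cryptocompare.com/data/histoday'
           '?fsym={}&tsym={}&limit={}&aggregate={}&allData={}'
           .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate, allData))
    if exchange:
        url += '&e={}'.format(exchange)
    payload = requests.get(url).json()
    # unknown symbols come back as {"Response": "Error", ...} with no usable "Data"
    if payload.get('Response') == 'Error' or not payload.get('Data'):
        print('skipping {}: {}'.format(symbol, payload.get('Message', 'no data')))
        return None
    df = pd.DataFrame(payload['Data'])
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp', inplace=True)
    df['symbol'] = symbol
    df['1dret'] = 100 * df['close'].pct_change()
    return df

frames = [daily_price_historical(coin) for coin in coins]  # coins: the list of 130 symbols
top_coin_prices = pd.concat([f for f in frames if f is not None])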