I'm new to PySpark and, after 2 days of searching, I still don't understand what I'm doing wrong with cogroup.
This is what I want to do:
I have a text file with a lot of words, and each word has a value:
Hello 5
.
.
.
Hi 8
Ops 9
and I have another file that contains sentences:
Hello my name is name
I want to calculate the value of the whole sentence according to the first file.
As you can see in the code, I turned the first file into an RDD that looks like this:
[(Hi,8),...(Ops,9)]
For the second file I want to create an RDD that looks like this:
[(Hello,1),...(Name,2)]
Now when I try to cogroup, I get this error:
AttributeError Traceback (most recent call last)
<ipython-input-3-c424da6be07f> in <module>
2 lines = ssc.textFileStream(dataDirectory)
3
----> 4 counts = lines.flatMap(lambda line: line.split(" ")) \
5 .map(lambda x: (x, 1)) \
6 .reduceByKey(lambda a, b: a + b) \
/usr/local/spark/spark/python/pyspark/streaming/dstream.py in cogroup(self, other, numPartitions)
350 if numPartitions is None:
351 numPartitions = self._sc.defaultParallelism
--> 352 return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other)
353
354 def join(self, other, numPartitions=None):
/usr/local/spark/spark/python/pyspark/streaming/dstream.py in transformWith(self, func, other, keepSerializer)
313 jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer, other._jrdd_deserializer)
314 dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(),
--> 315 other._jdstream.dstream(), jfunc)
316 jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
317 return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
AttributeError: 'PipelinedRDD' object has no attribute '_jdstream'
This is my code:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# Create Streaming Context with batch interval of 5 second.
ssc = StreamingContext(sc, 5)
# creating rdd for all the words in the dictionary file
text_file = sc.textFile('AFINN-111.txt')
def createPair(line):
    x = line.replace("\t"," ").split(" ")
    return (x[0],int(x[1]))

dictionary = text_file.map(createPair)
print(dictionary.take(20))

dataDirectory = 'FILES'
lines = ssc.textFileStream(dataDirectory)
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(lambda a, b: a + b) \
              .cogroup(dictionary)
counts.pprint()

# Start the computation
ssc.start()
ssc.awaitTermination()
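For reference, the traceback suggests that DStream.cogroup expects another DStream, while dictionary here is a plain RDD. Below is a minimal sketch of one way to combine each batch with the static dictionary RDD using transform and join instead of cogroup; it reuses the variable names above and is an illustration, not a verified fix:

# Sketch: join each batch's word counts with the static dictionary RDD,
# then sum value * count per batch. Assumes `ssc`, `lines`, and `dictionary`
# are defined as in the code above.
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda x: (x, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# transform() exposes each batch as a plain RDD, so a normal RDD join works.
scored = word_counts.transform(lambda rdd: rdd.join(dictionary))

# (word, (count, value)) -> count * value, summed over the batch.
totals = scored.map(lambda kv: kv[1][0] * kv[1][1]) \
               .reduce(lambda a, b: a + b)
totals.pprint()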
So I'm new to programming and machine learning, and I'm using this code I found in a journal for spam detection. When I try to use it, the result turns out to be an error, even though I already prepared the data correctly. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.'
Can anyone please help me out with this issue?
[The link for the complete code is here](https://github.com/ijdutse/spd)
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft
data_sources = ['I:\Data Penelitian\Iphone/iphone.json']
def main():
    spd = Spd(data_sources) #class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    return relevant_tweets

class Spd:
    """ some functions to accept raw files, extract relevant fields and filter our irrelevent content"""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        pass

    # first function in the class:
    def extractor(self, data_sources): # accept list of files consisting of raw tweets in form of json object
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open('I:\Data Penelitian\Iphone/iphone.json', 'r') as f: # data_source is read from extractor() function
            for line in f.readlines():
                non_English = 0
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])
                        data_extracts['ScreenName'].append(line['ScreenName'])
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets += 1
                except:
                    continue
        df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
        df0['CreatedAt'] = pd.to_datetime(data_extracts['CreatedAt'], errors='coerce') # convert to datetime
        df0['AccountCreated'] = pd.to_datetime(data_extracts['AccountCreated'], errors='coerce')
        df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
        AccountAge = [] # compute the account age of accounts
        date_format = "%Y-%m-%d %H:%M:%S"
        for dr, dc in zip(df0.CreatedAt, df0.AccountCreated):
            #try:
            dr = str(dr)
            dc = str(dc)
            d1 = datetime.strptime(dr, date_format)
            d2 = datetime.strptime(dc, date_format)
            dif = d1 - d2
            AccountAge.append(dif.days)
            #except:
            #continue
        df0['AccountAge'] = AccountAge
        # add/define additional features ...
        df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0]=='RT')
        df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
        df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
        df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
        df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
        df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
        df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
        df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
        df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
        df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
        df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
        df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
        df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
        df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
        df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
        # Remove all RTs, set UserID as index and save relevant files:
        df0 = df0[df0.Retweets.values==False] # remove retweets
        df0 = df0.set_index('UserID')
        df0 = df0[~df0.index.duplicated()] # remove duplicates in the tweet
        #df0.to_csv(data_source[:15]+'all_extracts.csv') #save all extracts as csv
        df0.to_csv(data_sources[:5]+'all_extracts.csv') #save all extracts as csv
        with open(data_sources[:5]+'non_English.txt','w') as d: # save count of non-English tweets
            d.write('{}'.format(non_english_tweets))
            d.close()
        return df0
    def detector(self, data_sources): # accept list of raw tweets as json objects
        self.data_sources = data_sources
        for data_sources in data_sources:
            self.data_sources = data_sources
            df0 = self.extractor(data_sources)
            #drop fields not required for predicition
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',
                          'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinity values to avoid 0 division ...
            X = X.dropna()
            # reload the trained model for use:
            spd_filter = pickle.load(open('trained_rf.pkl','rb'))
            PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0.loc[X.PredictedClass.values==0] # sort out spam/automated accounts
            #relevant_tweets = nonspam[['CreatedAt', 'CleanTweets']]
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            #relevant_tweets = relevant_tweets.drop('UserID', axis=1)
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') #save all extracts as csv, used to be 15
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
        return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ == '__main__':
    main()
The traceback error is as follows:
ValueError Traceback (most recent call last)
<ipython-input-2-5dc56f49d005> in <module>
142
143 if __name__ =='__main__':
--> 144 main()
<ipython-input-2-5dc56f49d005> in main()
18 spd = Spd(data_sources) #class instantiation
19 start = time.process_time()
---> 20 relevant_tweets = spd.detector(data_sources)
21 stop = time.process_time()
22 return relevant_tweets
<ipython-input-2-5dc56f49d005> in detector(self, data_sources)
126 # reload the trained model for use:
127 spd_filter=pickle.load(open('trained_rf.pkl','rb'))
--> 128 PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
129 X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
130 nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict(self, X)
543 The predicted classes.
544 """
--> 545 proba = self.predict_proba(X)
546
547 if self.n_outputs_ == 1:
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict_proba(self, X)
586 check_is_fitted(self, 'estimators_')
587 # Check data
--> 588 X = self._validate_X_predict(X)
589
590 # Assign chunk of trees to jobs
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 #property
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
548 " minimum of %d is required%s."
549 % (n_samples, array.shape, ensure_min_samples,
--> 550 context))
551
552 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.
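The error itself only says that the feature matrix handed to predict() has zero rows, so every row was filtered out somewhere between extractor() and the dropna() call. A small hypothetical helper (names and columns are illustrative, not part of the original script) that could be dropped into detector() to see where the rows disappear:

import numpy as np
import pandas as pd

def report_row_counts(df0: pd.DataFrame, feature_columns: list) -> pd.DataFrame:
    """Hypothetical helper: show how many rows survive each cleaning step."""
    print("rows returned by extractor():", len(df0))
    X = df0[feature_columns].replace([np.inf, -np.inf], np.nan)
    print("NaN/inf cells per feature column:")
    print(X.isna().sum())
    X = X.dropna()
    print("rows left for predict():", len(X))
    if X.empty:
        raise ValueError("No rows left to score; check whether extractor() is "
                         "silently skipping every line in its bare 'except'.")
    return X

If the count is already 0 right after extractor(), the bare except in the JSON loop is the first place to look, since any missing key in a line sends the whole line to continue.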
I have a FASTA file that looks like this:
>Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|Original|hCoV-19^^Hubei|Human|Wuhan Jinyintan Hospital|Wuhan Institute of Virology|Shi|China
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*
>Spike|hCoV-19/Philippines/PH-PGC-03696/2020|2020-12-23|EPI_ISL_2155626|Original|hCoV-19^^Central Luzon|Human|Research Institute for Tropical Medicine|Philippine Genome Center|Tablizo|Philippines
MFVFLVLLPLVFSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYYPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*
>Spike|hCoV-19/Belgium/UZA-UA-8350/2021|2021-01-22|EPI_ISL_940774|Original|hCoV-19^^Berchem|Human|Platform BIS UZA/UAntwerpen|UAntwerp|Xavier|Belgium
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNTVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAQHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCT*
I need to sort these sequences based on the date field. I found this code on Stack Overflow, but it doesn't do the job for me:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd
with open('F:/newone.fasta') as fasta_file:
    identifiers = []
    lengths = []
    seq = []
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split(None, 3)[0])
        lengths.append(len(sequence))
        seq.append(sequence)

# converting lists to pandas Series
s1 = pd.Series(identifiers, name='ID')
s2 = pd.Series(lengths, name='length')
s3 = pd.Series(seq, name='seq')
Qfasta = pd.DataFrame(dict(ID=s1, length=s2)).set_index(['ID'])
This is the error that happens with the second piece of code, and I don't know why it happens:
IndexError Traceback (most recent call last)
in <module>
12 SeqIO.write(records, output_file, "fasta")
13
---> 14 sort_fasta(input_file, output_file)
in sort_fasta(input_file, output_file)
8 def get_data(id_name):
9 return (id_name.split("|")[2], seguid(id_name))
---> 10 dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_data)
11 records = (dict_fasta[i] for i in sorted(list(dict_fasta), reverse=True, key = lambda d: list(map(int, d[0].split('-')))))
12 SeqIO.write(records, output_file, "fasta")
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\SeqIO\__init__.py in index(filename, format, alphabet, key_function)
873 key_function,
874 )
--> 875 return _IndexedSeqFileDict(
876 proxy_class(filename, format), key_function, repr, "SeqRecord"
877 )
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\File.py in __init__(self, random_access_proxy, key_function, repr, obj_repr)
185 offset_iter = random_access_proxy
186 offsets = {}
--> 187 for key, offset, length in offset_iter:
188 # Note - we don't store the length because I want to minimise the
189 # memory requirements. With the SQLite backend the length is kept
~\anaconda3\envs\deeplearning\lib\site-packages\Bio\File.py in <genexpr>(.0)
181 self._obj_repr = obj_repr
182 if key_function:
--> 183 offset_iter = ((key_function(k), o, l) for (k, o, l) in random_access_proxy)
184 else:
185 offset_iter = random_access_proxy
in get_data(id_name)
7 def sort_fasta(input_file, output_file):
8 def get_data(id_name):
----> 9 return (id_name.split("|")[2], seguid(id_name))
10 dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_data)
11 records = (dict_fasta[i] for i in sorted(list(dict_fasta), reverse=True, key = lambda d: list(map(int, d[0].split('-')))))
What should I do about this?
With the following code, the FASTA entries in the input file are sorted and saved to the output file using the SeqIO index function, so it should also work with files too big to fit in memory.
import re
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
input_file = "fasta.fasta"
output_file = "out.fasta"
def sort_fasta(input_file: str, output_file: str) -> None:
    def get_index_key(id_name: str) -> tuple:
        try:
            key = (re.search(r'\d{4}-\d{2}-\d{2}', id_name).group(), seguid(id_name))
        except AttributeError:
            key = ('0001-01-01', seguid(id_name))
        return key

    dict_fasta = SeqIO.index(input_file, "fasta", key_function=get_index_key)
    sorted_keys_by_date = sorted(list(dict_fasta), reverse=True,
                                 key=lambda d: list(map(int, d[0].split('-'))))
    records = (dict_fasta[i] for i in sorted_keys_by_date if i[0] != '0001-01-01')
    SeqIO.write(records, output_file, "fasta")

sort_fasta(input_file, output_file)
You can split the string on \n>, and sort on the extracted date using a combination of sorted and re.search to set the date as the key.
Use reverse=True as an option for sorted to get the most recent date first.
I am assuming the string fasta as input here.
import re
sorted_fasta = '>' + '\n>'.join(
    sorted(fasta[1:].strip().split('\n>'),
           key=lambda s: re.search(r'\|\d{4}-\d{2}-\d{2}\|', s).group())
)
example input:
>xxx|2020-12-30|xxx
NNN
>yyy|2020-12-23|yyy
NNN
>zzz|2021-01-22|zzz
NNN
matching output:
>yyy|2020-12-23|yyy
NNN
>xxx|2020-12-30|xxx
NNN
>zzz|2021-01-22|zzz
NNN
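As a variant of the snippet above (a sketch under the same assumption that fasta holds the whole file as a single string; the file names are hypothetical), sorting most-recent-first and writing the result back out could look like this:

import re

# Read the whole FASTA file into one string (assumed small enough for memory).
with open('newone.fasta') as fh:        # hypothetical input name
    fasta = fh.read()

# Most recent date first, as mentioned above: pass reverse=True to sorted().
sorted_fasta = '>' + '\n>'.join(
    sorted(fasta[1:].strip().split('\n>'),
           key=lambda s: re.search(r'\|\d{4}-\d{2}-\d{2}\|', s).group(),
           reverse=True)
)

with open('sorted.fasta', 'w') as out:  # hypothetical output name
    out.write(sorted_fasta + '\n')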
I'm new to the Python language (Python 3.6). I'm trying to work with IRIS zones and longitude/latitude coordinates, but I have some errors that I can't correct.
What I want to achieve: I have coordinates (latitude and longitude) in a CSV file. On the other hand, I have a GeoJSON file containing polygon shapes. I would like to see whether my coordinates fall inside each polygon of that file.
But I have some trouble with the code below; I don't understand the following error, raised in the last block of code: object of type 'NoneType' has no len().
If my post is missing some details, please tell me; I'll be glad to add information to help you understand the situation :)
This is my code:
import json, csv
import numpy
from shapely.geometry import shape, Point
def readJson(url):
    response = open(url)
    return json.loads(response.read())

def readCSV(url):
    response = open(url)
    return csv.DictReader(response, delimiter=',')

def getIris():
    """
    Returns a dictionary formed by the id of an iris and its coordinates.
    """
    dict = {}
    url = 'iris.json'
    data2 = readJson(url)
    for district in data2['features']:
        dict[district['id']] = district['geometry']
    return dict

def getPOIs():
    """
    Returns a list of tuples of POIs lat/long coordinates.
    """
    urls = [
        "./result.csv",
    ]
    POIs = []
    for url in urls:
        csv = readCSV(url)
        for line in csv:
            latitude = line.get('latitude', None)
            longitude = line.get('longitude', None)
            if latitude is not None and longitude is not None:
                POIs.append((float(longitude), float(latitude)))
    return POIs

def POIsInIris(iris, POIs):
    """
    Returns a dictionary formed by the id of a iris and the number of POIs that falls in
    this iris.
    """
    dict = {}
    for key, value in iris.items():
        dict[key] = 0
        polygon = shape(value)
        for p in POIs:
            point = Point(p[0], p[1])
            # print point.wkt
            if polygon.contains(point):
                dict[key] += 1
    return dict

if __name__ == '__main__':
    # Geographical Features
    iris_bbox = getIris()
    iris_number = len(iris_bbox)
    print("Iris: ", iris_number)
    print("Reading POIs...")
    POIs = getPOIs()
    print(len(POIs))
    print("Done Reading POIs")
    print("Calculating POIs per Iris")
    POIsPerIris = POIsInIris(iris_bbox, POIs)
    for k, v in POIsPerIris.items():
        print(k, v)
And the output :
Iris: 49404
Reading POIs...
0
Done Reading POIs
Calculating POIs per Iris
Moreover, I get 0 from print(len(POIs)), and I don't understand why.
Thank you a lot.
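On the len(POIs) == 0 point, a small hypothetical check (the file path and delimiter come from the code above, but the diagnosis is an assumption) can confirm what the DictReader actually sees:

import csv

# Hypothetical check: print the header row the DictReader sees.
# If the fieldnames are not exactly 'latitude' and 'longitude'
# (different case, extra spaces, or a ';' delimiter), every
# line.get('latitude') call above returns None and no POI is kept.
with open('./result.csv') as f:
    reader = csv.DictReader(f, delimiter=',')
    print(reader.fieldnames)
    first_row = next(reader, None)
    print(first_row)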
Edit: Here is the full error message:
TypeError Traceback (most recent call last)
<ipython-input-55-247f0f9756f9> in <module>()
10 print ("Done Reading POIs")
11 print ("Calculating POIs per Iris")
---> 12 POIsPerIris = POIsInIris(iris_bbox, POIs)
13 for k,v in POIsPerIris.items():
14 print (k,v)
<ipython-input-54-0877c0182800> in POIsInIris(iris, POIs)
8 for key, value in iris.items():
9 dict[key] = 0
---> 10 polygon = shape(value)
11 for p in POIs:
12 point = Point(p[0], p[1])
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/geo.py in shape(context)
39 return MultiLineString(ob["coordinates"])
40 elif geom_type == "multipolygon":
---> 41 return MultiPolygon(ob["coordinates"], context_type='geojson')
42 elif geom_type == "geometrycollection":
43 geoms = [shape(g) for g in ob.get("geometries", [])]
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/multipolygon.py in __init__(self, polygons, context_type)
62 self._geom, self._ndim = geos_multipolygon_from_polygons(polygons)
63 elif context_type == 'geojson':
---> 64 self._geom, self._ndim = geos_multipolygon_from_py(polygons)
65
66 def shape_factory(self, *args):
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/multipolygon.py in geos_multipolygon_from_py(ob)
136 assert L >= 1
137
--> 138 N = len(ob[0][0][0])
139 assert N == 2 or N == 3
140
TypeError: object of type 'NoneType' has no len()
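The traceback shows shape(value) failing while building a MultiPolygon, which suggests that at least one feature in iris.json has a null or malformed geometry. A hedged sketch of a variant of getIris() (the name getIrisSafe is hypothetical; it reuses readJson from the code above) that skips such features instead of crashing:

from shapely.geometry import shape

def getIrisSafe():
    """Variant of getIris() that skips features whose geometry cannot be parsed."""
    iris = {}
    data2 = readJson('iris.json')
    skipped = 0
    for district in data2['features']:
        geometry = district.get('geometry')
        try:
            # Validate the geometry once here, so POIsInIris() never sees a bad one.
            shape(geometry)
        except (TypeError, AttributeError, ValueError):
            skipped += 1
            continue
        iris[district['id']] = geometry
    print("skipped features with unusable geometry:", skipped)
    return iris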
I am running a sentiment analysis on a csv file and I am receiving this error message. I have tried a few things to resolve it and have not been successful. Any help would be greatly appreciated! Thank you!
Here is my code:
def sentimentAFINN(text):
    words = pattern_split.split(text.lower())
    sentiments = len(list(map(lambda word: afinn.get(word, 0), words)))
    if sentiments:
        sentiment = float(sum(sentiments))/math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment

def sentimentDisplayValue(sentimentScore):
    if sentimentScore > 0.1:
        return "Positive"
    elif sentimentScore < -0.1:
        return "Negative"
    else:
        return "Neutral"

totals = defaultdict(int)
for (index, row) in data.iterrows():
    text = row['comment']
    text_munged = munger(text)
    sentimentScore = sentimentAFINN(text_munged)
    sentimentDisplay = sentimentDisplayValue(sentimentScore)
    totals[sentimentDisplay] = totals[sentimentDisplay] + 1
    pt.add_row([text_munged, sentimentScore, sentimentDisplay])

print(pt)
print(totals)
This is my error message:
TypeError Traceback (most recent call last)
<ipython-input-73-b20887003b41> in <module>
4 text = row['LikelyToReferComment']
5 text_munged = munger(text)
----> 6 sentimentScore = sentimentAFINN(text_munged)
7 sentimentDisplay = sentimentDisplayValue(sentimentScore)
8 totals[sentimentDisplay] = totals[sentimentDisplay] + 1
<ipython-input-72-f95f79f94b60> in sentimentAFINN(text)
29 sentiments = len(list(map(lambda word: afinn.get(word, 0), words)))
30 if sentiments:
---> 31 sentiment = float(sum(sentiments))/math.sqrt(len(sentiments))
32
33 else:
TypeError: 'int' object is not iterable
Your sentiments variable is an int, since it's the value returned by len(). You are then trying to call sum() and len() on sentiments, and both sum() and len() expect an iterable datatype.
You can change your sentimentAFINN() like this:
def sentimentAFINN(text):
    words = pattern_split.split(text.lower())
    # save your list in sentiments
    sentiments = list(map(lambda word: afinn.get(word, 0), words))
    # now you check the length of sentiments and return accordingly
    return float(sum(sentiments))/math.sqrt(len(sentiments)) if len(sentiments) > 0 else 0
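A quick usage sketch of the corrected function, with a hypothetical AFINN-style dictionary and tokenizer standing in for the asker's afinn and pattern_split:

import math
import re

# Hypothetical stand-ins for the asker's objects.
afinn = {'good': 3, 'bad': -3, 'awful': -4}      # tiny AFINN-style dictionary
pattern_split = re.compile(r"\W+")               # simple word tokenizer

def sentimentAFINN(text):
    words = pattern_split.split(text.lower())
    sentiments = list(map(lambda word: afinn.get(word, 0), words))
    return float(sum(sentiments)) / math.sqrt(len(sentiments)) if len(sentiments) > 0 else 0

print(sentimentAFINN("The service was good but the food was awful"))  # prints a negative score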
I wrote a function which creates a graph from a list of edges (taken from a database). I use the graph-tool library; Python and this library are quite new to me.
Every vertex in the graph should be described by a pair of a string and a number. In the function, before adding a new vertex to the graph, I first check whether a vertex with the same properties already exists; to do this I use the find_vertex function. I don't understand why the TypeError occurs, please help. Here is the code of the function and the traceback (below):
Edit: One more thing, I have to deal with unicode strings.
def make_graph():
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(sql)
    wordnetList = cursor.fetchall()
    g = Graph(directed=False)
    vprop = g.new_vertex_property("python::object")
    g.vertex_properties['lexicalunit'] = vprop
    for (hyponym, v1, hyperonym, v2) in wordnetList: # hyponym and v1 (string and integer respectively) are properties of first vertex, hyperonym and v2 for the second
        matched1 = find_vertex(g, g.vp['lexicalunit'], (hyponym, v1)) # this is the line with the problem
        if len(matched1) == 0:
            ver1 = g.add_vertex()
            vprop[ver1] = (hyponym, v1)
        elif len(matched1) >= 1:
            ver1 = matched1[0]
        matched2 = find_vertex(g, g.vp['lexicalunit'], (hyperonym, v2))
        if len(matched2) == 0:
            ver2 = g.add_vertex()
            vprop[ver2] = (hyperonym, v2)
        elif len(matched2) >= 1:
            ver2 = matched2[0]
        g.add_edge(ver1, ver2)
    return g
Traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/home/olorin/Dokumenty/nlp-rr/CRFRelationRecogniser/python/wordnet_explorer/<ipython-input-2-163ed9b92398> in <module>()
----> 1 grf = make_graph()
/home/olorin/Dokumenty/nlp-rr/CRFRelationRecogniser/python/wordnet_explorer/hypgraph.py in make_graph()
65 for (hyponym, v1, hyperonym, v2) in wordnetList:
66 print(hyponym, v1, hyperonym, v2)
---> 67 matched1 = find_vertex(g, g.vp['lexicalunit'], (hyponym, v1))
68 if len(matched1) == 0:
69 ver1 = g.add_vertex()
/usr/lib/python2.7/dist-packages/graph_tool/util/__init__.pyc in find_vertex(g, prop, match)
53 can be either a :class:`~graph_tool.PropertyMap` or string with value "in",
54 "out" or "total", representing a degree type."""
---> 55 val = _convert(prop, match)
56 ret = libgraph_tool_util.\
57 find_vertex_range(weakref.ref(g), _degree(g, prop),
/usr/lib/python2.7/dist-packages/graph_tool/__init__.pyc in _convert(prop, val)
232 if type(vtype) is tuple:
233 return [vtype[1](x) for x in val]
--> 234 return vtype(val)
235
236
TypeError: object.__new__() takes no parameters
This is a bug in graph-tool. It has been fixed now in the git version: http://git.skewed.de/graph-tool/commit/?id=566d6dd816e167e1c9e824961537301ee1527e14
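Until an upgraded graph-tool is available, one possible workaround (a sketch, not taken from the linked fix) is to keep a plain Python dict from each (string, number) pair to its vertex, so find_vertex is never needed:

# Sketch of a find_vertex-free variant of make_graph(): a dict maps each
# (lexical unit, number) pair to its vertex, so lookups never touch the
# property-map conversion that triggers the bug. The edge list is passed
# in directly instead of being fetched from the database here.
from graph_tool.all import Graph

def make_graph(wordnetList):
    g = Graph(directed=False)
    vprop = g.new_vertex_property("python::object")
    g.vertex_properties['lexicalunit'] = vprop
    seen = {}  # (string, number) pair -> vertex

    def get_or_add(pair):
        if pair not in seen:
            v = g.add_vertex()
            vprop[v] = pair
            seen[pair] = v
        return seen[pair]

    for (hyponym, v1, hyperonym, v2) in wordnetList:
        ver1 = get_or_add((hyponym, v1))
        ver2 = get_or_add((hyperonym, v2))
        g.add_edge(ver1, ver2)
    return g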