I have the file app.py in the root of my app.zip, and the handler function (lambda_handler) is defined properly, matching the handler configuration app.lambda_handler.
Yet I am getting the error: Unable to import module 'app': No module named app
Where did I go wrong?
My script:
from __future__ import print_function
import json
import urllib
import boto3
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
stemmer=PorterStemmer()
import sys
reload(sys)
sys.setdefaultencoding('utf8')
print('Loading function')
s3 = boto3.client('s3')
number_of_sentences=0
number_of_words=0
word_list=[]
stop_words=set(stopwords.words('english'))
stop_word_list=[ v for v in stop_words]
modal_verbs=['can', 'could', 'may', 'might', 'must', 'shall', 'should', 'will' ,'would','ought']
auxilary_verbs=['be','do','have']
stop_word_list=stop_word_list+modal_verbs+auxilary_verbs
print("Starting Trigram generation")
#Empty Trigram list
tri_gram_list=[]
def lambda_handler(event, context):
    #print("Received event: " + json.dumps(event, indent=2))
    # Get the object from the event and show its content type
    '''
    '''
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("CONTENT TYPE: " + response['ContentType'])
        text = response['Body'].read()
        print(type(text))
        for line in text.readlines():
            for line in open("input.txt","r").readlines():
                line=unicode(line, errors='ignore')
                if len(line)>1:
                    sentences=sent_tokenize(line)
                    number_of_sentences+=len(sentences)
                    for sentence in sentences:
                        sentence=sentence.strip().lower()
                        #sentence = sentence.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ').replace('(', ' ').replace(')', ' ').replace(''`'', ' ').strip().lower()
                        words_from_sentence=tokenizer.tokenize(line)
                        words = [word for word in words_from_sentence if word not in stop_word_list]
                        number_of_words+=len(words)
                        stemmed_words = [stemmer.stem(word) for word in words]
                        word_list.extend(stemmed_words)
                        #generate Trigrams
                        tri_gram_list_t= [ " ".join([words[index],words[index+1],words[index+2]]) for index,value in enumerate(words) if index<len(words)-2]
                        #print tri_gram_list
                        tri_gram_list.extend(tri_gram_list_t)
        print number_of_words
        print number_of_sentences
        print("Conting frequency now...")
        count=Counter()
        for element in tri_gram_list:
            #print element, type(tri_gram_list)
            count[element]=count[element]+1
        print count.most_common(25)
        print "most common 25 words ARE:"
        for element in word_list:
            #print element, type(tri_gram_list)
            count[element]=count[element]+1
        print count.most_common(25)
        # body = obj.get()['Body'].read()
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
Try checking the log output (CloudWatch Logs); it will give you more information than the error you saw above.
Finally, remember that your script uses Python 2 syntax. If your Lambda runtime is Python 3, you need the Python 3 syntax: replace calls like print number_of_words with print(number_of_words).
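For reference, here is a minimal sketch of the Python 3 equivalents of the Python 2 constructs used in the script (assuming a Python 3 Lambda runtime; the values below are placeholders, not your actual data):

# Python 3 replacements for the Python 2 constructs in the script above
# (assumed Python 3 runtime; the values are placeholders).
import urllib.parse  # urllib.unquote_plus lives in urllib.parse on Python 3

raw_key = 'my+folder/my+file.txt'          # placeholder S3 key
key = urllib.parse.unquote_plus(raw_key)   # no .encode('utf8') needed

number_of_words = 42                       # placeholder counter
print(number_of_words)                     # print is a function, not a statement

# reload(sys) and sys.setdefaultencoding('utf8') are neither needed nor available on Python 3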
I have a folder that contains a group of files, and each file contains a text string, periods, and commas. I want to replace the periods and commas with spaces and print all the files afterwards.
I used replace, but this error appeared:
AttributeError: 'list' object has no attribute 'replace'
How can I solve it?
codes.py:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os

# 1-stop word processing
stop_words_list = stopwords.words('english')
additional_stopwords = []
with open("C:/Users/Super/Desktop/IR/homework/Lab4/IR Homework/stop words.txt", 'r') as file:
    for word in file:
        word = word.split('\n')
        additional_stopwords.append(word[0])
stop_words_list += additional_stopwords

# --------------
# 2-tokenize and stemming
dir_path = 'C:/Users/Super/Desktop/IR/homework/Lab4/corpus/corpus/'
save_dir = "C:/Users/Super/Desktop/IR/homework/Files_Without_SW/"
for document in os.listdir(dir_path):
    with open(dir_path + document, "r") as reader:
        save_file = open(save_dir + document, 'w')
        text = reader.read()
        tokens_without_sw = [word for word in text if (word not in stop_words_list)]
        cleaned = tokens_without_sw.replace(',', ' ')
        cleaned = cleaned.replace('.', ' ')
        ps = PorterStemmer()
        text_tokens = word_tokenize(cleaned)
        save_file.writelines(["%s " % item for item in text_tokens])
        # cleaned = (" ").join(tokens_without_sw)
        print(document, ':', tokens_without_sw)
        with open("../Files/stemmer_words.txt", "a+") as stemFile:
            for stemWord in tokens_without_sw:
                stemFile.write(stemWord)
                stemFile.write(":")
                stemFile.write(ps.stem(stemWord))
                stemFile.write('\n')
It seems you are trying to use the string method "replace" on a list. If your intention is to use it on all of the list's members, you can do it like so:
cleaned = [item.replace(',', ' ') for item in tokens_without_sw]
cleaned = [item.replace('.', ' ') for item in cleaned]
You can even take it one step further and do both replacements at once, instead of doing two list comprehensions.
cleaned = [item.replace(',', ' ').replace('.', ' ') for item in tokens_without_sw]
Another way without list comprehensions was mentioned in the comments by Andreas.
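For completeness, a quick self-contained check of the combined version (illustrative tokens only):

tokens_without_sw = ['hello,', 'world.', 'text']
cleaned = [item.replace(',', ' ').replace('.', ' ') for item in tokens_without_sw]
print(cleaned)  # ['hello ', 'world ', 'text']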
I am preprocessing the tweet data stored in the vaksinsampel2.csv file. I have done several steps such as text cleaning, case folding, tokenizing, stopword removal, and normalization, but I can't get stemming to work. Please help me solve it.
Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

df = pd.read_csv('vaksinsampel2.csv', encoding = 'unicode_escape')

def remove_punct(tweet):
    tweet = re.sub('[^a-zA-Z0-9 ]', ' ', str(tweet))
    tweet = re.sub('[0-9]+', ' ', tweet)
    tweet = re.sub(r'#', '', str(tweet))
    tweet = re.sub(r'http\S+', ' ', tweet)
    return tweet

df['TEXT'] = df['full_text'].apply(lambda x: remove_punct(x))
df['case_folding'] = df['TEXT'].str.lower()

def tokenization(tweet):
    tweet = re.split('\W+', tweet)
    return tweet

df['Tokenization'] = df['TEXT'].apply(lambda x: tokenization(x.lower()))
df.head(10)

stopword = nltk.corpus.stopwords.words('indonesian')

def remove_stopwords(tweet):
    tweet = [word for word in tweet if word not in stopword]
    return tweet

df['Stopword_Removal'] = df['Tokenization'].apply(lambda x: remove_stopwords(x))
df.head(10)

def normalisasi(tweet):
    kamus_slangword = eval(open("slang_indonesia.txt").read())  # open the slang-word dictionary
    pattern = re.compile(r'\b( ' + '|'.join(kamus_slangword.keys()) + r')\b')  # search for slang patterns (e.g. kpn -> kapan)
    content = []
    for kata in tweet:
        filteredSlang = pattern.sub(lambda x: kamus_slangword[x.group()], kata)  # replace slang words according to the defined pattern
        content.append(filteredSlang.lower())
    tweet = content
    return tweet

df['Normalization'] = df['Stopword_Removal'].apply(lambda x: normalisasi(x))
df.head(10)

factory = StemmerFactory()
stemming = factory.create_stemmer()

def stem_list(tweet):
    return stemming.stem(df['Normalization'])

df['Stemming'] = df.apply(stem_list, axis=1)
df.head(50)
stemmer.stem() and not stemming.stem()?
I am referring to this: https://pypi.org/project/Sastrawi/
(...just happened to start exploring Sastrawi today)
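As a quick sanity check that the stemmer object itself works, here is the basic usage example from that PyPI page (expected output as documented there):

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Basic Sastrawi usage, taken from the project's PyPI page
factory = StemmerFactory()
stemmer = factory.create_stemmer()

sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'
print(stemmer.stem(sentence))
# documented output: ekonomi indonesia sedang dalam tumbuh yang bangga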
Try this:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(tweet):
    text = [stemmer.stem(word) for word in tweet]
    return text  # return the stemmed tokens, not the original input

df['Stemming'] = df['Normalization'].apply(lambda x: stemming(x))
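A quick check of the helper on a plain token list (illustrative Indonesian words; the exact stems depend on Sastrawi's dictionary):

print(stemming(['perekonomian', 'membanggakan']))  # e.g. ['ekonomi', 'bangga']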
I have been trying to use a Python NLP script from my Qt GUI based C++ application.
Basically, in the application I am calling the NLP script through the command line:
QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();
The above works perfectly, but the problem is that it takes about 40-50 seconds to execute, as it first trains the model and then tests it.
But I want to train the model first and test it multiple times as we do in Jupyter Notebook.
For that I made a separate function for testing and tried to call it from the command line:
PS D:\DS Project\Treegramming> py nlp.py "test('it was amazing')"
But again, this executes the whole script first and then runs the function. Is there anything I can do to solve this?
Python script:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 16:18:01 2019
@author: Muhammad Ahmed
"""
import nltk
import sys
import random
import re, string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append( lematizer.lemmatize( word , pos ) )
    return sentence

def remove_noise(tokens , stop_words = ()):
    sentence = []
    for token, tag in pos_tag( tokens ):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append( token.lower() )
    return sentence

def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)

stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )

freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

pos_dataset = [(tweets, "Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets, "Negative") for tweets in negative_tokens_for_model]

dataset = pos_dataset + neg_dataset
random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

def test( custom_tweet ):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open( "result.txt" , "w" )
    f.write(res)
    f.close()

eval( sys.argv[1] );
You need to create two Python scripts:
The first to train and save the NaiveBayesClassifier.
The second to load and test the model.
To avoid repeating code, I will create a script of helper functions called utils.py, which should look like this:
import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append( lematizer.lemmatize( word , pos ) )
    return sentence

def remove_noise(tokens , stop_words = ()):
    sentence = []
    for token, tag in pos_tag( tokens ):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append( token.lower() )
    return sentence

def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
Then let's create the training script, I will call it train.py and it should look like this:
import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )

freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

pos_dataset = [(tweets, "Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets, "Negative") for tweets in negative_tokens_for_model]

dataset = pos_dataset + neg_dataset
random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
    pickle.dump(classifier, fout)
Finally, the test script test.py that should look like this:
import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize
from utils import remove_noise

#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
    classifier = pickle.load(fin)

def test( custom_tweet ):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open( "result.txt" , "w" )
    f.write(res)
    f.close()

eval( sys.argv[1] );
Now run train.py once to train the Naive Bayes classifier; it will create a new file called model.pickle that holds the trained classifier. Then run test.py from your C++ application on your custom tweet: test.py loads the trained model from model.pickle and applies it to the given tweet.
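As a side note, here is a hypothetical variation of test.py's last line that avoids eval() on the command-line argument; this is an assumption about how you might call it, not part of the original setup:

# Hypothetical alternative entry point for test.py: pass the tweet text directly
# as a plain argument instead of eval()-ing a Python expression from argv.
if __name__ == "__main__":
    test(sys.argv[1])
# usage (from a shell or your QProcess call): py test.py "it was amazing"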
I've got code that worked, up until I added the entropy portion to it. Now it's giving me an invalid syntax error on the print line. How come?
import nltk, math, re, numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer

def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(1) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

def sents():
    fileObj = open('1865-Lincoln.txt', 'r')
    text = fileObj.read()
    tokens = nltk.sent_tokenize(text)
    for name in tokens:
        words = ' '.join(name.split()[:4])
        count = len(name.split())
        entro = entropy(len(name.split())
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))
There is a closing parenthesis missing on the line above the print:
entro = entropy(len(name.split()))
Because the parenthesis is never closed, Python keeps reading into the next line and only reports the SyntaxError there, which is why the error points at the print statement.
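For context, a sketch of the affected lines from the question's sents() function with the fix applied:

for name in tokens:
    words = ' '.join(name.split()[:4])
    count = len(name.split())
    entro = entropy(len(name.split()))  # closing parenthesis added here
    print('{:<35} {:^15} {:>15}'.format(words, count, entro))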
I am pretty new to Python and this is the first code I have written. I am trying to use the NLTK package. The problem comes at the end, when trying to execute the label_probdist.prob('positive') line.
This is the error I get:
NameError                                 Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')

NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

# not in use nltk.download()  # Download the book package

# open the file that contains the wall posts and classifier
with open('Classified.csv', 'rb') as f:
    reader = csv.reader(f)
    FBsocial = map(tuple, reader)

import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)

FBSocialData = []  # sorting data
for row in FBsocial:
    statement = row[0]
    sentiment = row[1]
    words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
    FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)

# Extracting word features (list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
    all_words = []
    for (statement, sentiment) in FBSocialData:
        all_words.extend(statement)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)

# just a test
document = ("hei","grin","andre","jævlig","gøy",)

# Classifier to decide which features are relevant
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

extract_features(document)
# testing extract_features
extract_features("udviser blomsterbutik")

training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)

classifier = nltk.NaiveBayesClassifier.train(training_set)

def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)

#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining the variable label_probdist inside the function train and then trying to access it outside its scope. That is not possible: it is a local variable, not a global one.
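A minimal, self-contained illustration of the scope issue (hypothetical names, not your data):

# Hypothetical example: a name bound inside a function is local to that function.
def train_example():
    label_probdist_local = "only visible inside train_example"
    return label_probdist_local

result = train_example()        # capture the return value instead
print(result)                   # works
# print(label_probdist_local)   # would raise NameError, just like in the question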