I have a Python script that gets called by a PHP script. The PHP script runs as the apache user, so the Python script is also invoked as apache, and it fails with "Unable to locate credentials". I've set the default credentials via awscli, and when I invoke the Python script as root, it works.
This is my line of code :
# Create the SES client with explicit credentials passed as keyword arguments.
client = boto3.client('ses', region_name=awsregion, aws_access_key_id='AJHHJHJHJ', aws_secret_access_key='asdasd/asdasd/asd')
But, this gives "Invalid Syntax" Error. So, I tried this :
# Build one session that carries the explicit credentials, then derive the SES
# client from it. Previously the Session was bound to `client` and immediately
# overwritten by an unrelated boto3.client(...) call, so it was never used.
session = boto3.Session(aws_access_key_id='ASDASD', aws_secret_access_key='asd/asdasd/asdasd')
client = session.client('ses', region_name=awsregion)
Gives the same error as above. Weird thing is that this same thing is mentioned in the documentation. Even though it's not recommended, it should work.
Can somebody help me in fixing this?
Did you ever get this resolved? Here is how I connect to boto3 in my Python scripts:
import boto3
from botocore.exceptions import ClientError
import re
from io import BytesIO
import gzip
import datetime
import dateutil.parser as dparser
from datetime import datetime
import tarfile
import requests
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## Needed glue stuff
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
##
## currently this will run for everything that is in the staging directory of omniture
# set needed parms
myProfileName = 'MyDataLake'
dhiBucket = 'data-lake'
# Create the boto3 session with explicit credentials and region, then open S3.
try:
    session = boto3.Session(
        aws_access_key_id='aaaaaaaaaaaa',
        aws_secret_access_key='abcdefghijklmnopqrstuvwxyz',
        region_name='us-east-1',
    )
    s3 = session.resource('s3')  # establish connection to s3
except Exception as conne:
    # Report the failure locally and to the error-capture endpoint, then stop.
    print("Unable to connect: " + str(conne))
    errtxt = requests.post("https://errorcapturesite", data={'message':'Unable to connect to : ' + myProfileName, 'notify':True,'color':'red'})
    print(errtxt.text)
    # Exit with a non-zero status so the caller can see that the run failed.
    sys.exit(1)
Related
I'm trying to get JSON objects from an S3 bucket using PySpark (on Windows, using wsl2 terminal).
I can do this using boto3 as an intermediate step but, when I try to use the spark.read.json method, I get an error.
Code:
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import os
import multiprocessing
#----------------APACHE CONFIGURATIONS--------------
# Pull in the AWS SDK and hadoop-aws packages when the PySpark shell starts.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell'
#---------------spark--------------
conf = (
    SparkConf()
    .set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
    .set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
    .setAppName('pyspark_aws')
    .setMaster(f"local[{multiprocessing.cpu_count()}]")
    .setIfMissing("spark.executor.memory", "2g")
)
sc = SparkContext(conf=conf)
sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
spark = SparkSession(sc)
#--------------hadoop--------------
accessKeyId = 'xxxxxxxxxxxx'
secretAccessKey = 'xxxxxxxxx'
hadoopConf = sc._jsc.hadoopConfiguration()
hadoopConf.set('fs.s3a.access.key', accessKeyId)
hadoopConf.set('fs.s3a.secret.key', secretAccessKey)
hadoopConf.set('fs.s3a.endpoint', 's3-eu-west-1.amazonaws.com')
hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
# This S3A version reads size settings with Configuration.getLong, so suffixed
# defaults such as "32M" raise NumberFormatException; give plain byte counts.
hadoopConf.set('fs.s3a.block.size', '33554432')
hadoopConf.set('fs.s3a.multipart.size', '419430400')
hadoopConf.set('fs.s3a.multipart.threshold', '2097152000')
hadoopConf.set('fs.s3a.connection.maximum', '500')
# S3A options live under the "fs.s3a." prefix; without it the setting has no effect.
hadoopConf.set('fs.s3a.connection.timeout', '600000')
# NOTE(review): the braces are placeholders in the original snippet; substitute
# the real bucket, directory and object names before running.
s3_df = spark.read.json('s3a://{bucket}/{directory}/{object}.json')
Error:
py4j.protocol.Py4JJavaError: An error occurred while call
: java.lang.NumberFormatException: For input string: "32M
at java.base/java.lang.NumberFormatException.forI
at java.base/java.lang.Long.parseLong(Long.java:6
at java.base/java.lang.Long.parseLong(Long.java:8
at org.apache.hadoop.conf.Configuration.getLong(C
at org.apache.hadoop.fs.s3a.S3AFileSystem.getDefa
at org.apache.hadoop.fs.FileSystem.getDefaultBloc
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFile
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFile
at org.apache.hadoop.fs.FileSystem.exists(FileSys
at org.apache.spark.sql.execution.datasources.Dat
at org.apache.spark.sql.execution.datasources.Dat
at org.apache.spark.util.ThreadUtils$.$anonfun$pa
at java.base/java.util.concurrent.ForkJoinTask$Ruava.util.coteAction.exec(ForkJoinTask.java:1426)ncurrent.Fojava.base/java.util.concurrent.ForkJoinTask.dorkJoinWorkejava.base/java.util.concurrent.ForkJoinPool$WorThread.runjava.base/java.util.concurrent.ForkJoinPool.sc(ForkJoinWojava.base/java.util.concurrent.ForkJoinPool.rurkerThread.java.base/java.util.concurrent.ForkJoinWorkerTjava:183)
I added the multipart.size, multipart.threshold, connection.maximum, connection.timeout hadoop conf settings when I was getting a similar error earlier (this earlier error had '64M' instead of '32M' and changed when I added these conf settings)
I'm new to Spark so any and all tips/pointers would be helpful!
if needed
the "32M" is the default of "fs.s3a.block.size"
try hadoopConf.set('fs.s3a.block.size', '33554432')
See https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html
There you will find the explanations of the "32M" and the "64M" defaults.
I tried to connect to kafka topic using spark. It's not reading any data in its dstream or giving any error.
Here is my jupyter code:
import os
# Set before the SparkContext is created so the Kafka integration package is loaded.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
from pprint import pprint  # the standard-library module is "pprint", not "pretty"
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 60)  # 60-second batch interval
kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {'topic_name':1})
kafkaStream.pprint()
# Declaring the stream only builds the pipeline; nothing is consumed or printed
# until the streaming context is started.
ssc.start()
ssc.awaitTermination()
Nothing gets printed. Also tried with createDirectStream but didn't get any output. Followed Spark Streaming not reading from Kafka topics and added PYTHONPATH but it didn't help either.
Any help would be deeply appreciated. Thanks!
It's not clear whether you are sending any data, but you're not actually starting consumption.
You'll need this at the end:
# Start the streaming computation; consumption does not begin until this call.
ssc.start()
# Keep the driver alive until the streaming job terminates.
ssc.awaitTermination()
You need to add "auto.offset.reset": "smallest" to the createStream properties to read existing topic data.
from pyspark.streaming.kafka import KafkaUtils
# The direct stream locates brokers through "bootstrap.servers" (or
# "metadata.broker.list"); "smallest" replays data already in the topic.
directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"bootstrap.servers": brokers, "auto.offset.reset": "smallest"})
As cricket_007 mentioned, Structured Streaming is generally preferred. If you still want to use the directStream method, see the sample below.
Note: This reads messages from the topic 'topicname' and rewrites them into another topic called 'compacttopic'.
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
from kafka import SimpleProducer, KafkaClient
from kafka import KafkaProducer
# Module-level Kafka producer used by handler() to publish records.
producer = KafkaProducer(bootstrap_servers='localhost:9092')
def handler(message):
    """Republish every record of one micro-batch to the 'compacttopic' topic.

    Each record's value (element 1) is split on '|'; the first field is
    printed and used as the message key, and the full value is the payload.
    """
    for entry in message.collect():
        key_field = entry[1].split('|')[0]
        print(key_field)
        producer.send('compacttopic', key=str(key_field), value=str(entry[1]))
        producer.flush()
def main():
    """Stream records from a Kafka topic and pass each micro-batch to handler().

    Expects exactly two command-line arguments: the broker list and the topic
    name (for example ``localhost:9092 topicname``). Raises ValueError if a
    different number of arguments is supplied.
    """
    # Parse the arguments first so a bad invocation fails before Spark starts.
    brokers, topic = sys.argv[1:]
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 10)  # 10-second batch interval
    # Use the values from the command line instead of hard-coded literals.
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(handler)
    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    main()
spark-submit command :
./bin/spark-submit --jars /Users/KarthikeyanDurairaj/jarfiles/spark-streaming-kafka-0-8-assembly_2.11-2.3.1.jar topictotopic.py localhost:9092 topicname
Note : Adjust the jar version based on your spark installed version .
Structured Streaming Approach :
You can refer the below stack overflow link for pyspark based Structured Streaming.
Failed to find leader for topics; java.lang.NullPointerException NullPointerException at org.apache.kafka.common.utils.Utils.formatAddress
this is the code I used.
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import requests
if __name__ == "__main__":
    # Session is exposed by the requests package; the bare name Session was
    # never imported and would raise NameError.
    s = requests.Session()
    toGet = s.get
    spark = SparkSession\
        .builder\
        .appName("PythonDockerRepoStat")\
        .getOrCreate()
    # One URL per row of the text file.
    lines = spark.read.text('/data/urls.txt').rdd.map(lambda r: r[0])
    # NOTE(review): the GET calls run on the executors, so requests must be
    # importable there as well as on the driver.
    res = lines.flatMap(lambda x: x.split("\n"))\
        .map(lambda x: toGet(x))
    output = res.collect()
    print(output)
However, I got this error: ImportError: No module named requests.sessions
When launching Spark jobs, all dependencies have to be accessible to both:
- the driver interpreter;
- the executor interpreters.
Extending path:
sys.path.append('/usr/local/lib/python2.7/site-packages')
will affect only local driver interpreter. To set executor environment variables you can:
modify $SPARK_HOME/conf/spark-env.sh
use the spark.executorEnv.[EnvironmentVariableName] configuration option (for example by editing $SPARK_HOME/conf/spark-defaults.conf or setting the corresponding SparkConf key).
At the same time you should make sure that requests is installed / accessible on every worker node (if not using local / pseudo-distributed mode).
I am trying to get my appengine application logs from remote.
I am using remote_api, I tried with appcfg but I discarded it because it has a limit on the download/buffer so I can't download all the logs.
Now I am using the logservice, but if I use it in my code it doesn't return anything.
Here is my code:
import time
import urllib2
from google.appengine.ext.remote_api import remote_api_stub
from google.appengine.api.logservice import logservice
import getpass
import base64
import os
from appcfg import *
import getpass
import subprocess
# Environment values set before the remote API is configured
# (presumably what the App Engine SDK expects outside a real request — confirm).
os.environ.update({
    'HTTP_X_APPENGINE_TASKRETRYCOUNT': '1',
    "SERVER_SOFTWARE": "Developement",
    'HTTP_HOST': 'unitTest',
    'CURRENT_MODULE_ID': 'default',
    'CURRENT_VERSION_ID': '1.0',
})
# Account and application the script talks to.
email_address = "iacopo#indiegala.com"
application_url = "store-indiegala.appspot.com"
def aut():
    """Configure the remote_api stub for the application, then fetch its logs.

    Credentials are obtained through auth_func when the stub asks for them.
    """
    app_name = "store-indiegala.appspot.com"
    remote_api_stub.ConfigureRemoteApi(None, '/_ah/remote_api', auth_func, app_name)
    print("successfully authenticated")
    fetch_logs()
def fetch_logs():
    """Print the client IP of each request log returned by logservice.fetch.

    Logs are requested up to the current time with minimum_log_level set to
    LOG_LEVEL_INFO, including application logs and incomplete requests.
    """
    end_time = time.time()
    print("starting")
    for req_log in logservice.fetch(end_time=end_time, offset=None,
                                    minimum_log_level=logservice.LOG_LEVEL_INFO,
                                    include_app_logs=True, include_incomplete=True):
        # Call form matches the other prints and works on Python 2 and 3.
        print(req_log.ip)
def auth_func():
    """Return an (email, password) tuple, prompting for the password."""
    account = email_address
    secret = getpass.getpass('Password:')
    return (account, secret)
# Script entry point: configure the remote API and fetch the logs.
aut()
It successfully connects to my app and makes the logservice.fetch() call, but it returns an empty object... why?
Go to your logs in the App Engine admin and make sure you have the right module and version. They can be found in each log entry, for example:
2015-01-24 21:58:43.425 / active start=2015-01-24,13:57:36.773 AppEngine-Google; (+http://code.google.com/appengine) module=default version=baseline
Becomes:
import os
# These must match the module and version shown in the app's log entries.
os.environ["CURRENT_MODULE_ID"] = "default"
os.environ['CURRENT_VERSION_ID'] = "baseline"
I am trying to get my appengine application logs from remote. I am using remote_api, I tried with appcfg but I discarded it because it has a limit on the download/buffer so I can't download all the logs.
Now I am using the logservice, but if I use it in my code it doesn't return anything. Here is my code:
import time
import urllib2
from google.appengine.ext.remote_api import remote_api_stub
from google.appengine.api.logservice import logservice
import getpass
import base64
import os
from appcfg import *
import getpass
import subprocess
# Environment values set before the remote API is configured
# (presumably what the App Engine SDK expects outside a real request — confirm).
os.environ.update({
    'HTTP_X_APPENGINE_TASKRETRYCOUNT': '1',
    "SERVER_SOFTWARE": "Developement",
    'HTTP_HOST': 'unitTest',
    'CURRENT_MODULE_ID': 'default',
    'CURRENT_VERSION_ID': '1.0',
})
# Account and application the script talks to.
email_address = "********"
application_url = "myappid.appspot.com"
def aut():
    """Configure the remote_api stub for the application, then fetch its logs.

    Credentials are obtained through auth_func when the stub asks for them.
    """
    app_name = "myappid.appspot.com"
    remote_api_stub.ConfigureRemoteApi(None, '/_ah/remote_api', auth_func, app_name)
    print("successfully authenticated")
    fetch_logs()
def fetch_logs():
    """Print the client IP of each request log returned by logservice.fetch.

    Logs are requested up to the current time with minimum_log_level set to
    LOG_LEVEL_INFO, including application logs and incomplete requests.
    """
    end_time = time.time()
    print("starting")
    for req_log in logservice.fetch(end_time=end_time, offset=None,
                                    minimum_log_level=logservice.LOG_LEVEL_INFO,
                                    include_app_logs=True, include_incomplete=True):
        # Call form matches the other prints and works on Python 2 and 3.
        print(req_log.ip)
def auth_func():
    """Return an (email, password) tuple, prompting for the password."""
    account = email_address
    secret = getpass.getpass('Password:')
    return (account, secret)
# Script entry point: configure the remote API and fetch the logs.
aut()
It successfully connects to my app and makes the logservice.fetch() call, but it returns an empty object... why?