S3 the read operation timed out while reading commoncrawl data - python

In order to read few files from common crawl I have written this script
import warc
import boto
for line in sys.stdin:
line = line.strip()
#Connect to AWS and read a dataset
conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')
k = Key(pds)
k.key = line
f = warc.WARCFile(fileobj=GzipStreamFile(k))
skipped_doc = 0
for num, record in enumerate(f):
# analysis code
Where each line is the key of warc files. When I run this script to analyze 5 files, I got this exception
Traceback (most recent call last):
File "./warc_mapper_full.py", line 42, in <module>
for num, record in enumerate(f):
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 393, in __iter__
record = self.read_record()
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 364, in read_record
self.finish_reading_current_record()
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 358, in finish_reading_current_record
self.current_payload.read()
File "/usr/lib/python2.7/site-packages/warc/utils.py", line 59, in read
return self._read(self.length)
File "/usr/lib/python2.7/site-packages/warc/utils.py", line 69, in _read
content = self.buf + self.fileobj.read(size)
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 67, in read
result = super(GzipStreamFile, self).read(*args, **kwargs)
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 48, in readinto
data = self.read(len(b))
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 38, in read
raw = self.stream.read(io.DEFAULT_BUFFER_SIZE)
File "/usr/lib/python2.7/site-packages/boto/s3/key.py", line 400, in read
data = self.resp.read(size)
File "/usr/lib/python2.7/site-packages/boto/connection.py", line 413, in read
return http_client.HTTPResponse.read(self, amt)
File "/usr/lib64/python2.7/httplib.py", line 602, in read
s = self.fp.read(amt)
File "/usr/lib64/python2.7/socket.py", line 380, in read
data = self._sock.recv(left)
File "/usr/lib64/python2.7/ssl.py", line 736, in recv
return self.read(buflen)
File "/usr/lib64/python2.7/ssl.py", line 630, in read
v = self._sslobj.read(len or 1024)
ssl.SSLError: ('The read operation timed out',)
I run it many times. Above exception happened every time. Where is the problem ?

Related

Repl.it - cannot export table from python to excel

I currently work on a project to collect stock data and to export them in an excel worksheet as it is simply annoying to copy paste them.
Basically, when I run the script, everything is working well until .to_excel function.
I wondered if this library was working in Repl.it as I am working on it.
Just take a look to the code below:
from yahoofinancials import YahooFinancials
import yfinance as yf
from datetime import date, timedelta
from tabulate import tabulate
import pandas as pd
import numpy as np
import openpyxl
price = []
vol = []
tickers = ['AI.PA','AIR.PA','AMZN','AAPL','MT.AS','CS.PA','BNP.PA','^FCHI','CA.PA','CO.PA','ACA.PA','BN.PA','^GDAXI','KER.PA','OR.PA','MC.PA','^IXIC','NFLX','ORA.PA','RI.PA','RNO.PA','^GSPC','SAN.PA','GLE.PA','STLA.PA','STM.MI','TSLA','TTE.PA','FR.PA','BZ=F','AC.PA','AF.PA','SAF.PA']
def volatility(ticker):
stock_symbol = ticker
end_time = date.today()
start_time = end_time - timedelta(days=365)
end = end_time.strftime('%Y-%m-%d')
start = start_time.strftime('%Y-%m-%d')
json_prices = YahooFinancials(stock_symbol
).get_historical_price_data(start, end, 'daily')
prices = pd.DataFrame(json_prices[stock_symbol]
['prices'])[['formatted_date', 'close']]
prices.sort_index(ascending=False, inplace=True)
prices['returns'] = (np.log(prices.close /
prices.close.shift(-1)))
daily_std = np.std(prices.returns)
std = daily_std * 252 ** 0.5
df = yf.Ticker(ticker)
last_price = df.history(period='1d')['Close'][-1]
price.append(last_price)
vol.append(std)
for stock in tickers:
volatility(stock)
tb = {'TICKER':[tickers], 'Price':[price], '52w_vol':[vol]}
df2 = pd.DataFrame(tb)
df2.to_excel(r'path\Inliner_pricer.xlsm', sheet_name='Export_python', index = False)
Following is the error message :
Traceback (most recent call last):
File "main.py", line 47, in <module>
df2.to_excel(r'path/Inliner_pricer.xlsm', sheet_name='Export_python', index = False)
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/core/generic.py", line 2345
Traceback (most recent call last):
File "main.py", line 47, in <module>
df2.to_excel(r'path/Inliner_pricer.xlsm', sheet_name='Export_python', index = False)
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/core/generic.py", line 2345, in to_excel
formatter.write(
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/formats/excel.py", line 888, in write
writer = ExcelWriter( # type: ignore[abstract]
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/excel/_openpyxl.py", line 53, in __init__
super().__init__(
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/excel/_base.py", line 1106, in __init__
self.handles = get_handle(
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/common.py", line 670, in get_handle
ioargs = _get_filepath_or_buffer(
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/common.py", line 339, in _get_filepath_or_buffer
with urlopen(req_info) as req:
File "/home/runner/LovableLuxuriousCircle/venv/lib/python3.8/site-packages/pandas/io/common.py", line 239, in urlopen
return urllib.request.urlopen(*args, **kwargs)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 1397, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/urllib/request.py", line 1354, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/http/client.py", line 1256, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/http/client.py", line 1267, in _send_request
self.putrequest(method, url, **skips)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/http/client.py", line 1101, in putrequest
self._validate_path(url)
File "/nix/store/p21fdyxqb3yqflpim7g8s1mymgpnqiv7-python3-3.8.12/lib/python3.8/http/client.py", line 1201, in _validate_path
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. 'path\Inliner_pricer.xlsm' (found at least ' ')
Thanks in advance.

xlwings recently stopped getting live data from excel via Range

I was running a script to get data from excel for over a year using the Xlwings range command like so...
list=Range('A1:D10').value
Suddenly, it stopper working. I had changed nothing in the code nor the system, other than maybe installing another network card.
This is the error when trying to use the Range assignment now.
Traceback (most recent call last):
File "G:\python32\fetcher.py", line 61, in <module>
listFull = getComData()
File "G:\python32\fetcher.py", line 38, in getComData
listFull=Range('A4:H184').value
File "G:\python32\lib\site-packages\xlwings\main.py", line 1490, in __init__
impl = apps.active.range(cell1).impl
File "G:\python32\lib\site-packages\xlwings\main.py", line 439, in range
return Range(impl=self.impl.range(cell1, cell2))
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 457, in range
xl1 = self.xl.Range(arg1)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 341, in xl
self._xl = get_xl_app_from_hwnd(self._hwnd)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 251, in get_xl_app_from_hwnd
disp = COMRetryObjectWrapper(Dispatch(p))
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 96, in Dispatch
return __WrapDispatch(dispatch, userName, resultCLSID, typeinfo, clsctx=clsctx)
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 37, in __WrapDispatch
klass = gencache.GetClassForCLSID(resultCLSID)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 180, in GetClassForCLSID
mod = GetModuleForCLSID(clsid)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 223, in GetModuleForCLSID
mod = GetModuleForTypelib(typelibCLSID, lcid, major, minor)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 259, in GetModuleForTypelib
mod = _GetModule(modName)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 622, in _GetModule
mod = __import__(mod_name)
ValueError: source code string cannot contain null bytes

PicklingError when getting the result from ray

I'm working on slowly converting my very serialized text analysis engine to use Modin and Ray. Feels like I'm nearly there, however, I seem to have hit a stumbling block. My code looks like this:
vectorizer = TfidfVectorizer(
analyzer=ngrams, encoding="ascii", stop_words="english", strip_accents="ascii"
)
tf_idf_matrix = vectorizer.fit_transform(r_strings["name"])
r_vectorizer = ray.put(vectorizer)
r_tf_idf_matrix = ray.put(tf_idf_matrix)
n = 2
match_results = []
for fn in files["c.file"]:
match_results.append(
match_name.remote(fn, r_vectorizer, r_tf_idf_matrix, r_strings, n)
)
match_returns = ray.get(match_results)
I'm following the guidance from the "anti-patterns" section in the Ray documentation, on what to avoid, and this is very similar to that of the "better" pattern.
Traceback (most recent call last):
File "alt.py", line 213, in <module>
match_returns = ray.get(match_results)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 62, in wrapper
return func(*args, **kwargs)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/worker.py", line 1501, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PicklingError): ray::match_name() (pid=23393, ip=192.168.1.173)
File "python/ray/_raylet.pyx", line 564, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1652, in ray._raylet.CoreWorker.store_task_outputs
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 327, in serialize
return self._serialize_to_msgpack(value)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 307, in _serialize_to_msgpack
self._serialize_to_pickle5(metadata, python_objects)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 267, in _serialize_to_pickle5
raise e
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 264, in _serialize_to_pickle5
value, protocol=5, buffer_callback=writer.buffer_callback)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 580, in dump
return Pickler.dump(self, obj)
_pickle.PicklingError: args[0] from __newobj__ args has the wrong class
Definitely an unexpected result. I'm not sure where to go next with this and would appreciate help from folks who have more experience with Ray and Modin.

Python client for AWS Redis Cluster

Can anyone suggest a Python client for AWS Redis Cluster enabled?
I'm using redis-py-cluster, but it fails:
Sample code:
from rediscluster import StrictRedisCluster
startup_nodes = [{"host": "xxxx.clustercfg.apn2.cache.amazonaws.com", "port": "6379"}]
r = StrictRedisCluster(startup_nodes=startup_nodes, decode_responses=True, skip_full_coverage_check=False)
r.set('foo', 'bar')
value = r.get('foo')
======
Exception:
Traceback (most recent call last):
File "testRedisCluster.py", line 11, in
r = StrictRedisCluster(startup_nodes=startup_nodes, decode_responses=True, skip_full_coverage_check=False)
File "/Library/Python/2.7/site-packages/rediscluster/client.py", line 181, in init
**kwargs
File "/Library/Python/2.7/site-packages/rediscluster/connection.py", line 141, in init
self.nodes.initialize()
File "/Library/Python/2.7/site-packages/rediscluster/nodemanager.py", line 228, in initialize
need_full_slots_coverage = self.cluster_require_full_coverage(nodes_cache)
File "/Library/Python/2.7/site-packages/rediscluster/nodemanager.py", line 270, in cluster_require_full_coverage
return any(node_require_full_coverage(node) for node in nodes.values())
File "/Library/Python/2.7/site-packages/rediscluster/nodemanager.py", line 270, in
return any(node_require_full_coverage(node) for node in nodes.values())
File "/Library/Python/2.7/site-packages/rediscluster/nodemanager.py", line 267, in node_require_full_coverage
return "yes" in r_node.config_get("cluster-require-full-coverage").values()
File "/Library/Python/2.7/site-packages/redis/client.py", line 715, in config_get
return self.execute_command('CONFIG GET', pattern)
File "/Library/Python/2.7/site-packages/redis/client.py", line 668, in execute_command
return self.parse_response(connection, command_name, **options)
File "/Library/Python/2.7/site-packages/redis/client.py", line 680, in parse_response
response = connection.read_response()
File "/Library/Python/2.7/site-packages/redis/connection.py", line 629, in read_response
raise response
redis.exceptions.ResponseError: unknown command 'CONFIG'
I'm using redis-py-cluster 1.3.4.
Any idea?
Change the parameter skip_full_coverage_check=False to skip_full_coverage_check=True

Split PDF files in python - ValueError: invalid literal for int() with base 10: '' "

I am trying to split a huge pdf file into several small pdfs usinf pyPdf. I was trying with this oversimplified code:
from pyPdf import PdfFileWriter, PdfFileReader
inputpdf = PdfFileReader(file("document.pdf", "rb"))
for i in xrange(inputpdf.numPages):
output = PdfFileWriter()
output.addPage(inputpdf.getPage(i))
outputStream = file("document-page%s.pdf" % i, "wb")
output.write(outputStream)
outputStream.close()
but I got the follow error message:
Traceback (most recent call last):
File "./hltShortSummary.py", line 24, in <module>
for i in xrange(inputpdf.numPages):
File "/usr/lib/pymodules/python2.7/pyPdf/pdf.py", line 342, in <lambda>
numPages = property(lambda self: self.getNumPages(), None, None)
File "/usr/lib/pymodules/python2.7/pyPdf/pdf.py", line 334, in getNumPages
self._flatten()
File "/usr/lib/pymodules/python2.7/pyPdf/pdf.py", line 500, in _flatten
pages = catalog["/Pages"].getObject()
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 466, in __getitem__
return dict.__getitem__(self, key).getObject()
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 165, in getObject
return self.pdf.getObject(self).getObject()
File "/usr/lib/pymodules/python2.7/pyPdf/pdf.py", line 549, in getObject
retval = readObject(self.stream, self)
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 67, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 517, in readFromStream
value = readObject(stream, pdf)
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 58, in readObject
return ArrayObject.readFromStream(stream, pdf)
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 153, in readFromStream
arr.append(readObject(stream, pdf))
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 87, in readObject
return NumberObject.readFromStream(stream)
File "/usr/lib/pymodules/python2.7/pyPdf/generic.py", line 232, in readFromStream
return NumberObject(name)
ValueError: invalid literal for int() with base 10: ''
any ideas???
I think this is a bug in pypdf. Check out the source here. NumberObject.readFromStream expects an integer-like string, and isn't getting one. Probably the pdf in question is malformed in some unexpected way.
Try it this way
for i in xrange(inputpdf.getNumPages()):

Categories