I'm running a Scrapy spider in Python to scrape images from a website. One of the images fails to download (even if I try to download it manually through the site), which appears to be an internal error on the site's side. That's fine; I don't care about getting that image. I just want to skip over it when it fails and move on to the other images, but I keep getting a 10054 error.
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 137, in parse_photo_page
    self.retrievePhoto(base_url_photo + url[0], url_text)
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 49, in wrapped_f
    return Retrying(*dargs, **dkw).call(f, *args, **kw)
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 212, in call
    raise attempt.get()
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 247, in get
    six.reraise(self.value[0], self.value[1], self.value[2])
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 200, in call
    attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
  File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 216, in retrievePhoto
    code.write(f.read())
  File "c:\python27\lib\socket.py", line 355, in read
    data = self._sock.recv(rbufsize)
  File "c:\python27\lib\httplib.py", line 612, in read
    s = self.fp.read(amt)
  File "c:\python27\lib\socket.py", line 384, in read
    data = self._sock.recv(left)
error: [Errno 10054] An existing connection was forcibly closed by the remote host
Here is my parse function that looks at the photo page and finds the important URLs:
def parse_photo_page(self, response):
    for sel in response.xpath('//table[@id="tblData"]/tr'):
        url = sel.xpath('td/font/a/@href').extract()
        table_fields = sel.xpath('td/font/text()').extract()
        if url:
            base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
            url_text = table_fields[3]
            url_text = string.replace(url_text, " ", "")
            url_text = string.replace(url_text, " ", "")
            self.retrievePhoto(base_url_photo + url[0], url_text)
Here is my download function with retry decorator:
from retrying import retry

@retry(stop_max_attempt_number=5, wait_fixed=2000)
def retrievePhoto(self, url, filename):
    fullPath = self.saveLocation + "/" + filename
    urllib.urlretrieve(url, fullPath)
It retries the download 5 times, but then raises the 10054 error and does not continue to the next image. How can I get the spider to continue after the retries are exhausted? Again, I don't care about downloading the problem image; I just want to skip over it.
It's correct that you shouldn't use urllib inside Scrapy, because it blocks everything. Try reading resources related to "scrapy twisted" and "scrapy asynchronous". Anyway... I don't believe that your main problem is "continuing after retrying" but rather not using relevant XPath expressions. Here is a version that works for me (note the ./ in './td/font/a/@href'):
import scrapy
import string
import urllib
import os

from retrying import retry


class MyspiderSpider(scrapy.Spider):
    name = "myspider"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()

    def parse(self, response):
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, " ", "")
                url_text = string.replace(url_text, " ", "")
                self.retrievePhoto(base_url_photo + url[0], url_text)

    @retry(stop_max_attempt_number=5, wait_fixed=2000)
    def retrievePhoto(self, url, filename):
        fullPath = self.saveLocation + "/" + filename
        urllib.urlretrieve(url, fullPath)
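If you also want the crawl to survive an image whose retries are exhausted, one option is to catch the final error at the call site and move on. A minimal sketch, relying on the fact (visible in your traceback) that retrying re-raises the underlying socket.error once the attempts run out:

import socket

# inside parse(), around the download call:
try:
    self.retrievePhoto(base_url_photo + url[0], url_text)
except (IOError, socket.error) as e:
    # all 5 attempts failed; note the bad image and continue with the next row
    self.log("skipping %s: %s" % (url[0], e))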
And here's a (much better) version that follows your patterns but uses the ImagesPipeline that @paul trmbrth mentioned.
import scrapy
import string
import os


class MyspiderSpider(scrapy.Spider):
    name = "myspider2"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()
    custom_settings = {
        "ITEM_PIPELINES": {'scrapy.pipelines.images.ImagesPipeline': 1},
        "IMAGES_STORE": saveLocation
    }

    def parse(self, response):
        image_urls = []
        image_texts = []
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, " ", "")
                url_text = string.replace(url_text, " ", "")
                image_urls.append(base_url_photo + url[0])
                image_texts.append(url_text)
        return {"image_urls": image_urls, "image_texts": image_texts}
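A side benefit of ImagesPipeline is that downloads go through Scrapy's own downloader, so one failed image doesn't abort the crawl; it just shows up as a failed entry in the pipeline's results. If you want to log what was skipped, here is a sketch of a subclass (the class name is mine):

import logging

from scrapy.pipelines.images import ImagesPipeline

logger = logging.getLogger(__name__)


class TolerantImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples
        item['images'] = [x for ok, x in results if ok]
        for ok, failure in results:
            if not ok:
                logger.warning("image download failed: %s", failure)
        return item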
The demo file I use is this:
$ cat index.html
<table id="tblData"><tr>
<td><font>hi foo <span /> <span /> green.jpg </font></td>
</tr><tr>
<td><font>hi foo <span /> <span /> blue.jpg </font></td>
</tr></table>
I downloaded a zip file from https://clinicaltrials.gov/AllPublicXML.zip, which contains over 200k XML files (most are < 10 KB in size), to a directory (see 'dirpath_zip' in the CODE) I created on Ubuntu 16.04 (using DigitalOcean). What I'm trying to accomplish is loading all of these into MongoDB (also installed in the same location as the zip file).
I ran the CODE below twice and it consistently failed when processing the 15988th file.
I've googled around and tried reading other posts regarding this particular error, but couldn't find a way to solve it. Actually, I'm not really sure what the problem is... any help is much appreciated!!
CODE:
import re
import sys  # needed by timestamper()'s error branch
import json
import zipfile
import pymongo
import datetime
import xmltodict
from bs4 import BeautifulSoup
from pprint import pprint as ppt


def timestamper(stamp_type="regular"):
    if stamp_type == "regular":
        timestamp = str(datetime.datetime.now())
    elif stamp_type == "filename":
        timestamp = str(datetime.datetime.now()).replace("-", "").replace(":", "").replace(" ", "_")[:15]
    else:
        sys.exit("ERROR [timestamper()]: unexpected 'stamp_type' (parameter) encountered")
    return timestamp


client = pymongo.MongoClient()
db = client['ctgov']
coll_name = "ts_" + timestamper(stamp_type="filename")
coll = db[coll_name]

dirpath_zip = '/glbdat/ctgov/all/alltrials_20180402.zip'
z = zipfile.ZipFile(dirpath_zip, 'r')

i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        soup = BeautifulSoup(z.read(xmlfile), 'lxml')
        json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
        coll.insert_one(json_study)
    i += 1
ERROR MESSAGE:
Traceback (most recent call last):
  File "zip_to_mongo_alltrials.py", line 38, in <module>
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
  File "/usr/local/lib/python3.5/dist-packages/bs4/__init__.py", line 225, in __init__
    markup, from_encoding, exclude_encodings=exclude_encodings)):
  File "/usr/local/lib/python3.5/dist-packages/bs4/builder/_lxml.py", line 118, in prepare_markup
    for encoding in detector.encodings:
  File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 264, in encodings
    self.chardet_encoding = chardet_dammit(self.markup)
  File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 34, in chardet_dammit
    return chardet.detect(s)['encoding']
  File "/usr/lib/python3/dist-packages/chardet/__init__.py", line 30, in detect
    u.feed(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/universaldetector.py", line 128, in feed
    if prober.feed(aBuf) == constants.eFoundIt:
  File "/usr/lib/python3/dist-packages/chardet/charsetgroupprober.py", line 64, in feed
    st = prober.feed(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/hebrewprober.py", line 224, in feed
    aBuf = self.filter_high_bit_only(aBuf)
  File "/usr/lib/python3/dist-packages/chardet/charsetprober.py", line 53, in filter_high_bit_only
    aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
  File "/usr/lib/python3.5/re.py", line 182, in sub
    return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Try pushing the reading from the file and the inserting into the db out into another method.
Also add gc.collect() for garbage collection.
import gc

def read_xml_insert(xmlfile):
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
    json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)

i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        read_xml_insert(xmlfile)
    i += 1
    gc.collect()
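For what it's worth, the traceback runs out of memory inside chardet, which BeautifulSoup only invokes when it is handed raw bytes. Assuming the ClinicalTrials.gov files are UTF-8 (as their XML declarations state), decoding each file yourself skips that detection pass entirely; a sketch of the same helper under that assumption:

def read_xml_insert(xmlfile):
    # decoding to str ourselves means BeautifulSoup never calls chardet,
    # which is the code path raising MemoryError in the traceback
    text = z.read(xmlfile).decode('utf-8')  # assumption: files are UTF-8
    soup = BeautifulSoup(text, 'lxml')
    json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)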
I've got a csv file with URLs and I need to scrape metadata from those websites. I'm using Python requests for that, with the code below:
from tempfile import NamedTemporaryFile
import shutil
import csv
from bs4 import BeautifulSoup
import requests
import re
import html5lib
import sys
#import logging

filename = 'TestWTF.csv'
#logging.basicConfig(level=logging.DEBUG)

#Get filename (with extension) from terminal
#filename = sys.argv[1]
tempfile = NamedTemporaryFile(delete=False)
read_timeout = 1.0

#Does actual scraping, returns metaTag data
def getMetadata(url, metaTag):
    r = requests.get("http://" + url, timeout=2)
    data = r.text
    soup = BeautifulSoup(data, 'html5lib')
    metadata = soup.findAll(attrs={"name": metaTag})
    return metadata

#Gets either keyword or description
def addDescription(row):
    scrapedKeywordsData = getMetadata(row, 'keywords')
    if not scrapedKeywordsData:
        print row + ' NO KEYWORDS'
        scrapedKeywordsData = getMetadata(row, 'description')
        if not scrapedKeywordsData:
            return ''
    return scrapedKeywordsData[0]

def prepareString(data):
    output = data
    #Get rid of opening meta content
    if output.startswith('<meta content="'):
        output = data[15:]
    #Get rid of closing meta content (keywords)
    if output.endswith('" name="keywords"/>'):
        output = output[:-19]
    #Get rid of closing meta content (description)
    if output.endswith('" name="description"/>'):
        output = output[:-22]
    return output

def iterator():
    with open(filename, 'rb') as csvFile, tempfile:
        reader = csv.reader(csvFile, delimiter=',', quotechar='"')
        writer = csv.writer(tempfile, delimiter=',', quotechar='"')
        i = 0
        for row in reader:
            try:
                data = str(addDescription(row[1]))
                row[3] = prepareString(data)
            except requests.exceptions.RequestException as e:
                print e
            except requests.exceptions.Timeout as e:
                print e
            except requests.exceptions.ReadTimeout as e:
                print "lol"
            except requests.exceptions.ConnectionError as e:
                print "These aren't the domains we're looking for."
            except requests.exceptions.ConnectTimeout as e:
                print "Too slow Mojo!"
            writer.writerow(row)
            i = i + 1
            print i
    shutil.move(tempfile.name, filename)

def main():
    iterator()

#Defining main function
if __name__ == '__main__':
    main()
It works just fine, but at some URLs (maybe 2-3 out of, say, 3000) it suddenly stops and does not progress to the next one after the timeout elapses. So I have to kill it using Ctrl+C, which results in the file not being saved.
I know it's a problem with catching exceptions, but I cannot figure out which one, or what else to do about it. I'd be more than happy to simply ignore the URL it gets stuck on.
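One detail worth flagging in those handlers (separate from the hang itself, whose traceback below ends inside BeautifulSoup rather than requests): requests.exceptions.RequestException is the base class of Timeout, ConnectionError, and the rest, so listing it first makes every later except clause unreachable. A minimal reordering sketch:

try:
    data = str(addDescription(row[1]))
    row[3] = prepareString(data)
except requests.exceptions.ConnectTimeout:
    print "Too slow Mojo!"
except requests.exceptions.ReadTimeout:
    print "Read timed out"
except requests.exceptions.ConnectionError:
    print "These aren't the domains we're looking for."
except requests.exceptions.RequestException as e:
    # the base class goes last, as a catch-all for anything else requests raises
    print e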
EDIT:
Added traceback:
^CTraceback (most recent call last):
  File "blacklist.py", line 90, in <module>
    main()
  File "blacklist.py", line 85, in main
    iterator()
  File "blacklist.py", line 62, in iterator
    data = str(addDescription (row[1] ))
  File "blacklist.py", line 30, in addDescription
    scrapedKeywordsData = getMetadata(row, 'keywords')
  File "blacklist.py", line 25, in getMetadata
    metadata = soup.findAll(attrs={"name":metaTag})
  File "/Library/Python/2.7/site-packages/bs4/element.py", line 1259, in find_all
    return self._find_all(name, attrs, text, limit, generator, **kwargs)
  File "/Library/Python/2.7/site-packages/bs4/element.py", line 537, in _find_all
    found = strainer.search(i)
  File "/Library/Python/2.7/site-packages/bs4/element.py", line 1654, in search
    found = self.search_tag(markup)
  File "/Library/Python/2.7/site-packages/bs4/element.py", line 1626, in search_tag
    if not self._matches(attr_value, match_against):
  File "/Library/Python/2.7/site-packages/bs4/element.py", line 1696, in _matches
    if isinstance(markup, Tag):
KeyboardInterrupt
EDIT 2:
Example website for which the script doesn't work: miniusa.com
I'm using Scrapy to download images from http://www.vesselfinder.com/vessels
However, I can only get the relative URL of the images, like this: http://www.vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0
All of the images are named 0.jpg, but if I try to use that as the absolute URL, I cannot get access to the image.
My code:
items.py
import scrapy

class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem

class VesselPipeline(object):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
vessel_spider.py
import scrapy
import string
from vessel.items import VesselItem

class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" % i for i in range(0, 1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = str(sel.xpath('div[1]/a/picture/img/@src').extract())
            item['image_urls'] = self.page_name + imageStr[3:-2]
            nameStr = str(sel.xpath('div[2]/header/h1/a/text()').extract())
            item['name'] = nameStr[19:-8]
            typeStr = str(sel.xpath('div[2]/div[2]/div[2]/text()').extract())
            item['type'] = typeStr[3:-2]
        return item
When I run this spider, I get the exceptions.ValueError: Missing scheme in request url: h error, because I did not provide an absolute URL.
[vessel] ERROR: Error processing {'image_urls': 'http://vesselfinder.com/vessels/ship-photo/0-224138470-a2fdc783d05a019d00ad9db0cef322f7/0.jpg',
 'name': 'XILGARO ALEANTE',
 'type': 'Sailing vessel'}
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
    return process_chain(self.methods[methodname], obj, *args)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
    d.callback(input)
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 383, in callback
    self._startRunCallbacks(result)
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 491, in _startRunCallbacks
    self._runCallbacks()
--- <exception caught here> ---
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 578, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/media.py", line 40, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/images.py", line 104, in get_media_requests
    return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
  File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 26, in __init__
    self._set_url(url)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 61, in _set_url
    raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
How should I fix this? Is there any special way of getting the image (or its absolute URL) on a site like this one?
Wrap your image URL in a list, like so (the images pipeline iterates over image_urls, so a bare string gets iterated character by character, which is why the error shows only the h of http):
item['image_urls'] = [self.page_name + imageStr[3:-2]]
I think the following code will do the trick (only a few changes to your code),
vessel_spider.py
class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" % i for i in range(0, 1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = sel.xpath('./div[@class="small-12 medium-5 large-5 columns"]/a/picture/img/@src').extract()
            imageStr = imageStr[0] if imageStr else 'N/A'
            item['image_urls'] = [self.page_name + imageStr]
            nameStr = sel.xpath('./div/header/h1[@class="subheader"]/a/text()').extract()
            nameStr = ' '.join(' '.join(nameStr).split()) if nameStr else 'N/A'
            item['name'] = nameStr
            typeStr = sel.xpath('.//div[@class="small-4 columns" and contains(text(), "Ship type")]/following-sibling::div/text()').extract()
            typeStr = typeStr[0].strip() if typeStr else 'N/A'
            item['ship_type'] = typeStr
            yield item
items.py
class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    ship_type = scrapy.Field()
And a sample of the output:
{'image_urls': u'http://vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0',
'name': u'IBTISAM ATAO',
'ship_type': u'Sailing vessel'}
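As a side note, on Scrapy 1.0+ you can let response.urljoin() and extract_first() handle the URL assembly instead of string slicing; a sketch of the same loop under that assumption:

def parse(self, response):
    for sel in response.xpath('//div[@class="items"]/article'):
        item = VesselItem()
        src = sel.xpath('./div[1]/a/picture/img/@src').extract_first()
        if src:
            # urljoin resolves a relative src against the page's own URL,
            # so image_urls always receives a fully qualified URL
            item['image_urls'] = [response.urljoin(src)]
        yield item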
Hey guys, I'm making a Python web crawler at the moment. I have a link whose last characters are "search?q=", and after that I append a word from a wordlist I loaded into a list beforehand. But when I try to open it with urllib2.urlopen(url), it throws an error (urlopen error no host given). When I open the link normally (typing in the word that would otherwise be pasted in automatically), it works just fine. Can you tell me why this is happening?
Thanks and regards
Full error:
File "C:/Users/David/PycharmProjects/GetAppResults/main.py", line 61, in <module>
getResults()
File "C:/Users/David/PycharmProjects/GetAppResults/main.py", line 40, in getResults
usock = urllib2.urlopen(url)
File "C:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 402, in open
req = meth(req)
File "C:\Python27\lib\urllib2.py", line 1113, in do_request_
raise URLError('no host given')
urllib2.URLError: <urlopen error no host given>
Code:
with open(filePath, "r") as ins:
    wordList = []
    for line in ins:
        wordList.append(line)

def getResults():
    packageID = ""
    count = 0
    word = "Test"
    for x in wordList:
        word = x
        print word
        url = 'http://www.example.com/search?q=' + word
        usock = urllib2.urlopen(url)
        page_source = usock.read()
        usock.close()
        print page_source
        startSequence = "data-docid=\""
        endSequence = "\""
        while page_source.find(startSequence) != -1:
            start = page_source.find(startSequence) + len(startSequence)
            end = page_source.find(endSequence, start)
            print str(start)
            print str(end)
            link = page_source[start:end]
            print link
            if link:
                if not link in packageID:
                    packageID += link + "\r\n"
                    print packageID
            page_source = page_source[end + len(endSequence):]
            count += 1
When I print the string word, it outputs the correct word from the wordlist.
I solved the problem. I'm simply using urllib now instead of urllib2, and everything works fine. Thank you all :)
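One likely cause worth checking (an observation about the posted code, not something confirmed in the thread): lines read from a file keep their trailing "\n", and a newline embedded in the URL is a plausible trigger for urllib2's "no host given". Stripping and quoting the word may make the urllib2 version work as well:

import urllib
import urllib2

for x in wordList:
    word = x.strip()  # drop the trailing newline kept by file iteration
    url = 'http://www.example.com/search?q=' + urllib.quote(word)
    usock = urllib2.urlopen(url)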
Note that urlopen() returns a response, not a request.
You may have a broken proxy configuration; verify that your proxies are working:
print urllib.getproxies()
or bypass proxy support altogether with:
usock = urllib.urlopen(
    "http://www.example.com/search?q=" + word,
    proxies={})
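With urllib2, which the question's code uses, the equivalent is to build an opener with an empty ProxyHandler (a sketch using the question's url variable):

import urllib2

# an empty mapping tells ProxyHandler to use no proxies at all
opener = urllib2.build_opener(urllib2.ProxyHandler({}))
usock = opener.open(url)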
Here's a sample way of combining a URL with a word from the wordlist. It joins a list entry with a word to build the URL, reads the page, and downloads the images it finds. Loop it around to cover the whole list you have.
import urllib
import re

print "The URL crawler starts.."

mylist = ["http://www.ebay", "https://www.npmjs.org/"]
wordlist = [".com", "asss"]
x = 1

urlcontent = urllib.urlopen(mylist[0] + wordlist[0]).read()
imgUrls = re.findall('img .*?src="(.*?)"', urlcontent)

for imgUrl in imgUrls:
    img = imgUrl
    print img
    urllib.urlretrieve(img, str(x) + ".jpg")
    x = x + 1
Hope this helps, else post your code and error logs.
I'm trying to upload a file to Podio but it fails.
Here is an extract:
c = api.OAuthClient(
    podio_pw.client_id,
    podio_pw.client_secret,
    podio_pw.username,
    podio_pw.password,
)

source = "dit is een test"
attributes = {
    'filename': 'test.txt',
    'source': source
}
filep = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
This always results in the following error:
Traceback (most recent call last):
  File "C:\Python34\libs\podio-py-master\attach_invoices.py", line 43, in <module>
    filep = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
  File "C:\Python34\libs\podio-py-master\pypodio2\transport.py", line 135, in __call__
    body = "".join(body)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 376, in __next__
    return next(self)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 352, in __next__
    block = next(self.param_iter)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 245, in iter_encode
    block = self.encode(boundary)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 233, in encode
    if re.search("^--%s$" % re.escape(boundary), value, re.M):
  File "C:\Python34\lib\re.py", line 166, in search
    return _compile(pattern, flags).search(string)
TypeError: can't use a string pattern on a bytes-like object
I know it has something to do with byte encoding etc., but I have no idea how to handle it. Even if I try to make that source a file, a raw file, or whatever, the POST fails.
This worked for me:
c = api.OAuthClient(
    client_id,
    client_secret,
    username,
    password,
)

filename = 'screenie.png'
filedata = open(filename, 'rb')  # binary mode, since a PNG is binary data

"""Create a file from raw data"""
attributes = {'filename': filename,
              'source': filedata}

file_upload = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
print(file_upload)
I lifted the code from here: https://github.com/podio/podio-py/blob/master/pypodio2/areas.py
To execute the file upload process in Python 3.*, you have to update two files in pypodio.
Step 1
Replace the file encode.py with the script below.
import urllib.request
import http.client
import mimetypes
import codecs
import uuid
import binascii
import io
import os
import sys


def multipart_encode(fields, files):
    content_type, body = MultipartFormdataEncoder().encode(fields, files)
    return body, content_type


class MultipartFormdataEncoder(object):
    def __init__(self):
        self.boundary = uuid.uuid4().hex
        self.content_type = 'multipart/form-data; boundary={}'.format(self.boundary)

    @classmethod
    def u(cls, s):
        if sys.hexversion < 0x03000000 and isinstance(s, str):
            s = s.decode('utf-8')
        if sys.hexversion >= 0x03000000 and isinstance(s, bytes):
            s = s.decode('utf-8')
        return s

    def iter(self, fields, files):
        """
        fields is a sequence of (name, value) elements for regular form fields.
        files is a sequence of (name, filename, file-type) elements for data to be uploaded as files
        Yield body's chunk as bytes
        """
        encoder = codecs.getencoder('utf-8')
        for (key, value) in fields:
            key = self.u(key)
            yield encoder('--{}\r\n'.format(self.boundary))
            yield encoder(self.u('Content-Disposition: form-data; name="{}"\r\n').format(key))
            yield encoder('\r\n')
            if isinstance(value, int) or isinstance(value, float):
                value = str(value)
            yield encoder(self.u(value))
            yield encoder('\r\n')
        for (key, filename, fpath) in files:
            key = self.u(key)
            filename = self.u(filename)
            yield encoder('--{}\r\n'.format(self.boundary))
            yield encoder(self.u('Content-Disposition: form-data; name="{}"; filename="{}"\r\n').format(key, filename))
            yield encoder(
                'Content-Type: {}\r\n'.format(mimetypes.guess_type(filename)[0] or 'application/octet-stream'))
            yield encoder('\r\n')
            with open(fpath, 'rb') as fd:
                buff = fd.read()
                yield (buff, len(buff))
            yield encoder('\r\n')
        yield encoder('--{}--\r\n'.format(self.boundary))

    def encode(self, fields, files):
        body = io.BytesIO()
        for chunk, chunk_len in self.iter(fields, files):
            body.write(chunk)
        return self.content_type, body.getvalue()
Code snippet from here
Step 2
Update transport.py at line no. 186:
if kwargs['type'] == 'multipart/form-data':
    fields = [('filename', kwargs['body']['filename'])]
    files = [('source', kwargs['body']['filename'], kwargs['body']['source'])]
    body, content_type = multipart_encode(fields, files)
    headers.update({'Content-Type': content_type, })
else:
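With both patches applied, 'source' is expected to be a filesystem path rather than a file object or raw string, because the encoder opens it with open(fpath, 'rb'). A usage sketch (file name and path are placeholders):

attributes = {'filename': 'test.txt', 'source': '/path/to/test.txt'}
file_upload = c.transport.POST(url='/file/v2/',
                               body=attributes,
                               type='multipart/form-data')
print(file_upload)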
Returns "can only join an iterable error"
attributes={'filename': 'mx.txt', 'source': 'hello uyur92wyhfr ruptgpwyoer8t9u'}
try:
item = c.transport.POST(url=url,
body=attributes,
type='multipart/form-data')
except Exception as e:
print(e)