I am making a form request to a website using Scrapy. The form requires uploading a PDF file. How can I do this in Scrapy? I am trying it like this:
FormRequest(url,callback=self.parseSearchResponse,method="POST",formdata={'filename':'abc.xyz','file':'path to file/abc.xyz'})
At this very moment Scrapy has no built-in support for uploading files.
File uploading via forms in HTTP was specified in RFC 1867. According to the spec, an HTTP request with Content-Type: multipart/form-data is required (a plain FormRequest like yours sends application/x-www-form-urlencoded instead).
To achieve file uploading with Scrapy, you would need to:
Get familiar with the basic concepts of HTTP file uploading.
Start with scrapy.Request (instead of FormRequest).
Give it a proper Content-Type header value.
Build the request body yourself.
See also: How does HTTP file upload work?
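A rough sketch of those steps (this is not an official Scrapy API; the field name "file", the filename, and the MIME type are placeholders that must match what the target form expects):

import uuid
from scrapy import Request

def pdf_upload_request(url, pdf_path, callback):
    # Build the multipart/form-data body by hand (RFC 1867 style).
    boundary = uuid.uuid4().hex.encode('ascii')
    with open(pdf_path, 'rb') as f:
        pdf_bytes = f.read()
    parts = [
        b'--' + boundary,
        b'Content-Disposition: form-data; name="file"; filename="abc.pdf"',
        b'Content-Type: application/pdf',
        b'',
        pdf_bytes,
        b'--' + boundary + b'--',
        b'',
    ]
    body = b'\r\n'.join(parts)
    headers = {'Content-Type': 'multipart/form-data; boundary=' + boundary.decode('ascii')}
    # A plain Request with an explicit Content-Type header and a hand-built body.
    return Request(url, method='POST', headers=headers, body=body, callback=callback)

# Usage in a spider callback (placeholders):
#     yield pdf_upload_request(upload_url, 'path to file/abc.pdf', self.parseSearchResponse)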
I just spent an entire day trying to figure out how to implement this.
Finally, I came upon a Scrapy pull request from 2016 that was never merged, with an implementation of a multipart form request:
from scrapy import FormRequest
from six.moves.urllib.parse import urljoin, urlencode
import lxml.html
from parsel.selector import create_root_node
import six
import string
import random
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
class MultipartFormRequest(FormRequest):
def __init__(self, *args, **kwargs):
formdata = kwargs.pop('formdata', None)
kwargs.setdefault('method', 'POST')
super(MultipartFormRequest, self).__init__(*args, **kwargs)
content_type = self.headers.setdefault(b'Content-Type', [b'multipart/form-data'])[0]
method = kwargs.get('method').upper()
if formdata and method == 'POST' and content_type == b'multipart/form-data':
items = formdata.items() if isinstance(formdata, dict) else formdata
self._boundary = ''
# encode the data using multipart spec
self._boundary = to_bytes(''.join(
random.choice(string.digits + string.ascii_letters) for i in range(20)), self.encoding)
self.headers[b'Content-Type'] = b'multipart/form-data; boundary=' + self._boundary
            request_data = _multipart_encode(items, self._boundary, self.encoding)
self._set_body(request_data)
class MultipartFile(object):
def __init__(self, name, content, mimetype='application/octet-stream'):
self.name = name
self.content = content
self.mimetype = mimetype
def _get_form_url(form, url):
if url is None:
return urljoin(form.base_url, form.action)
return urljoin(form.base_url, url)
def _urlencode(seq, enc):
values = [(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])]
return urlencode(values, doseq=1)
def _multipart_encode(items, boundary, enc):
body = []
for name, value in items:
body.append(b'--' + boundary)
if isinstance(value, MultipartFile):
file_name = value.name
content = value.content
content_type = value.mimetype
body.append(
b'Content-Disposition: form-data; name="' + to_bytes(name, enc) + b'"; filename="' + to_bytes(file_name,
enc) + b'"')
body.append(b'Content-Type: ' + to_bytes(content_type, enc))
body.append(b'')
body.append(to_bytes(content, enc))
else:
body.append(b'Content-Disposition: form-data; name="' + to_bytes(name, enc) + b'"')
body.append(b'')
body.append(to_bytes(value, enc))
body.append(b'--' + boundary + b'--')
return b'\r\n'.join(body)
def _get_form(response, formname, formid, formnumber, formxpath):
"""Find the form element """
root = create_root_node(response.text, lxml.html.HTMLParser,
base_url=get_base_url(response))
forms = root.xpath('//form')
if not forms:
raise ValueError("No <form> element found in %s" % response)
if formname is not None:
        f = root.xpath('//form[@name="%s"]' % formname)
if f:
return f[0]
if formid is not None:
        f = root.xpath('//form[@id="%s"]' % formid)
if f:
return f[0]
# Get form element from xpath, if not found, go up
if formxpath is not None:
nodes = root.xpath(formxpath)
if nodes:
el = nodes[0]
while True:
if el.tag == 'form':
return el
el = el.getparent()
if el is None:
break
encoded = formxpath if six.PY3 else formxpath.encode('unicode_escape')
raise ValueError('No <form> element found with %s' % encoded)
# If we get here, it means that either formname was None
# or invalid
if formnumber is not None:
try:
form = forms[formnumber]
except IndexError:
raise IndexError("Form number %d not found in %s" %
(formnumber, response))
else:
return form
def _get_inputs(form, formdata, dont_click, clickdata, response):
try:
formdata = dict(formdata or ())
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')
inputs = form.xpath('descendant::textarea'
'|descendant::select'
                        '|descendant::input[not(@type) or @type['
                        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
                        ' and (../@checked or'
                        ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={
"re": "http://exslt.org/regular-expressions"})
values = [(k, u'' if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata]
if not dont_click:
clickable = _get_clickable(clickdata, form)
if clickable and clickable[0] not in formdata and not clickable[0] is None:
values.append(clickable)
values.extend(formdata.items())
return values
def _value(ele):
n = ele.name
v = ele.value
if ele.tag == 'select':
return _select_value(ele, n, v)
return n, v
def _select_value(ele, n, v):
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
        # And for select tags without options
o = ele.value_options
return (n, o[0]) if o else (None, None)
elif v is not None and multiple:
        # This is a workaround for a bug in lxml, fixed in 2.3.1
# fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
        selected_options = ele.xpath('.//option[@selected]')
v = [(o.get('value') or o.text or u'').strip() for o in selected_options]
return n, v
def _get_clickable(clickdata, form):
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
clickables = [
el for el in form.xpath(
'descendant::*[(self::input or self::button)'
            ' and re:test(@type, "^submit$", "i")]'
            '|descendant::button[not(@type)]',
namespaces={"re": "http://exslt.org/regular-expressions"})
]
if not clickables:
return
# If we don't have clickdata, we just use the first clickable element
if clickdata is None:
el = clickables[0]
return (el.get('name'), el.get('value') or '')
# If clickdata is given, we compare it to the clickable elements to find a
# match. We first look to see if the number is specified in clickdata,
# because that uniquely identifies the element
nr = clickdata.get('nr', None)
if nr is not None:
try:
el = list(form.inputs)[nr]
except IndexError:
pass
else:
return (el.get('name'), el.get('value') or '')
# We didn't find it, so now we build an XPath expression out of the other
# arguments, because they can be used as such
xpath = u'.//*' + \
            u''.join(u'[@%s="%s"]' % c for c in six.iteritems(clickdata))
el = form.xpath(xpath)
if len(el) == 1:
return (el[0].get('name'), el[0].get('value') or '')
elif len(el) > 1:
raise ValueError("Multiple elements found (%r) matching the criteria "
"in clickdata: %r" % (el, clickdata))
else:
raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
This is the code I used to call the request (in my case I needed to upload an image):
with open(img_path, 'rb') as file:
img = file.read()
file_name = os.path.basename(img_path)
multipart_file = MultipartFile(file_name, img, "image/png")
form_data = {
"param": "value", # this is an example of a text parameter
"PicUpload": multipart_file
}
yield MultipartFormRequest(url=upload_url, formdata=form_data,
callback=self.my_callback)
It's a shame that so much time has passed and Scrapy still doesn't have a built-in way to do this, especially since someone wrote a very simple implementation years ago.
I wanted to create some formatted text like this:
𝐇𝐞𝐥𝐥𝐨 𝐖𝐨𝐫𝐥𝐝!
And then put it onto the clipboard using Python, so that when I paste, the text appears in its formatted version on Windows.
I have tried copying text with a few libraries, but all they put on the clipboard is plain, unformatted text:
Hello world
I want my clipboard text in bold format like this:
𝗛𝗲𝗹𝗹𝗼 𝗪𝗼𝗿𝗹𝗱
"""
Created on Sep 24, 2013
@author: jordans
Requires pywin32
original: http://code.activestate.com/recipes/474121/
# HtmlClipboard
# An interface to the "HTML Format" clipboard data format
__author__ = "Phillip Piper (jppx1[at]bigfoot.com)"
__date__ = "2006-02-21"
__version__ = "0.1"
"""
import re
import time
import random
import win32clipboard
#---------------------------------------------------------------------------
# Convenience functions to do the most common operation
def HasHtml():
"""
Return True if there is a Html fragment in the clipboard..
"""
cb = HtmlClipboard()
return cb.HasHtmlFormat()
def GetHtml():
"""
Return the Html fragment from the clipboard or None if there is no Html in the clipboard.
"""
cb = HtmlClipboard()
if cb.HasHtmlFormat():
return cb.GetFragment()
else:
return None
def PutHtml(fragment):
"""
Put the given fragment into the clipboard.
Convenience function to do the most common operation
"""
cb = HtmlClipboard()
cb.PutFragment(fragment)
#---------------------------------------------------------------------------
class HtmlClipboard:
CF_HTML = None
MARKER_BLOCK_OUTPUT = \
"Version:1.0\r\n" \
"StartHTML:%09d\r\n" \
"EndHTML:%09d\r\n" \
"StartFragment:%09d\r\n" \
"EndFragment:%09d\r\n" \
"StartSelection:%09d\r\n" \
"EndSelection:%09d\r\n" \
"SourceURL:%s\r\n"
MARKER_BLOCK_EX = \
"Version:(\S+)\s+" \
"StartHTML:(\d+)\s+" \
"EndHTML:(\d+)\s+" \
"StartFragment:(\d+)\s+" \
"EndFragment:(\d+)\s+" \
"StartSelection:(\d+)\s+" \
"EndSelection:(\d+)\s+" \
"SourceURL:(\S+)"
MARKER_BLOCK_EX_RE = re.compile(MARKER_BLOCK_EX)
MARKER_BLOCK = \
"Version:(\S+)\s+" \
"StartHTML:(\d+)\s+" \
"EndHTML:(\d+)\s+" \
"StartFragment:(\d+)\s+" \
"EndFragment:(\d+)\s+" \
"SourceURL:(\S+)"
MARKER_BLOCK_RE = re.compile(MARKER_BLOCK)
DEFAULT_HTML_BODY = \
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">" \
"<HTML><HEAD></HEAD><BODY><!--StartFragment-->%s<!--EndFragment--></BODY></HTML>"
def __init__(self):
self.html = None
self.fragment = None
self.selection = None
self.source = None
self.htmlClipboardVersion = None
def GetCfHtml(self):
"""
Return the FORMATID of the HTML format
"""
if self.CF_HTML is None:
self.CF_HTML = win32clipboard.RegisterClipboardFormat("HTML Format")
return self.CF_HTML
def GetAvailableFormats(self):
"""
Return a possibly empty list of formats available on the clipboard
"""
formats = []
try:
win32clipboard.OpenClipboard(0)
cf = win32clipboard.EnumClipboardFormats(0)
while (cf != 0):
formats.append(cf)
cf = win32clipboard.EnumClipboardFormats(cf)
finally:
win32clipboard.CloseClipboard()
return formats
def HasHtmlFormat(self):
"""
Return a boolean indicating if the clipboard has data in HTML format
"""
return (self.GetCfHtml() in self.GetAvailableFormats())
def GetFromClipboard(self):
"""
Read and decode the HTML from the clipboard
"""
# implement fix from: http://teachthe.net/?p=1137
cbOpened = False
while not cbOpened:
try:
win32clipboard.OpenClipboard(0)
src = win32clipboard.GetClipboardData(self.GetCfHtml())
src = src.decode("UTF-8")
#print(src)
self.DecodeClipboardSource(src)
cbOpened = True
win32clipboard.CloseClipboard()
except Exception as err:
# If access is denied, that means that the clipboard is in use.
# Keep trying until it's available.
if err.winerror == 5: # Access Denied
pass
# wait on clipboard because something else has it. we're waiting a
# random amount of time before we try again so we don't collide again
time.sleep( random.random()/50 )
elif err.winerror == 1418: # doesn't have board open
pass
elif err.winerror == 0: # open failure
pass
else:
print( 'ERROR in Clipboard section of readcomments: %s' % err)
pass
def DecodeClipboardSource(self, src):
"""
Decode the given string to figure out the details of the HTML that's on the string
"""
# Try the extended format first (which has an explicit selection)
matches = self.MARKER_BLOCK_EX_RE.match(src)
if matches:
self.prefix = matches.group(0)
self.htmlClipboardVersion = matches.group(1)
self.html = src[int(matches.group(2)):int(matches.group(3))]
self.fragment = src[int(matches.group(4)):int(matches.group(5))]
self.selection = src[int(matches.group(6)):int(matches.group(7))]
self.source = matches.group(8)
else:
# Failing that, try the version without a selection
matches = self.MARKER_BLOCK_RE.match(src)
if matches:
self.prefix = matches.group(0)
self.htmlClipboardVersion = matches.group(1)
self.html = src[int(matches.group(2)):int(matches.group(3))]
self.fragment = src[int(matches.group(4)):int(matches.group(5))]
self.source = matches.group(6)
self.selection = self.fragment
def GetHtml(self, refresh=False):
"""
Return the entire Html document
"""
if not self.html or refresh:
self.GetFromClipboard()
return self.html
def GetFragment(self, refresh=False):
"""
        Return the Html fragment. A fragment is well-formatted HTML enclosing the selected text
"""
if not self.fragment or refresh:
self.GetFromClipboard()
return self.fragment
def GetSelection(self, refresh=False):
"""
Return the part of the HTML that was selected. It might not be well-formed.
"""
if not self.selection or refresh:
self.GetFromClipboard()
return self.selection
def GetSource(self, refresh=False):
"""
Return the URL of the source of this HTML
"""
        if not self.source or refresh:
self.GetFromClipboard()
return self.source
def PutFragment(self, fragment, selection=None, html=None, source=None):
"""
Put the given well-formed fragment of Html into the clipboard.
selection, if given, must be a literal string within fragment.
html, if given, must be a well-formed Html document that textually
contains fragment and its required markers.
"""
if selection is None:
selection = fragment
if html is None:
html = self.DEFAULT_HTML_BODY % fragment
if source is None:
source = "file://HtmlClipboard.py"
fragmentStart = html.index(fragment)
fragmentEnd = fragmentStart + len(fragment)
selectionStart = html.index(selection)
selectionEnd = selectionStart + len(selection)
self.PutToClipboard(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)
def PutToClipboard(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source="None"):
"""
Replace the Clipboard contents with the given html information.
"""
try:
win32clipboard.OpenClipboard(0)
win32clipboard.EmptyClipboard()
src = self.EncodeClipboardSource(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)
src = src.encode("UTF-8")
#print(src)
win32clipboard.SetClipboardData(self.GetCfHtml(), src)
finally:
win32clipboard.CloseClipboard()
def EncodeClipboardSource(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source):
"""
Join all our bits of information into a string formatted as per the HTML format specs.
"""
# How long is the prefix going to be?
dummyPrefix = self.MARKER_BLOCK_OUTPUT % (0, 0, 0, 0, 0, 0, source)
lenPrefix = len(dummyPrefix)
prefix = self.MARKER_BLOCK_OUTPUT % (lenPrefix, len(html)+lenPrefix,
fragmentStart+lenPrefix, fragmentEnd+lenPrefix,
selectionStart+lenPrefix, selectionEnd+lenPrefix,
source)
return (prefix + html)
def DumpHtml():
cb = HtmlClipboard()
print("GetAvailableFormats()=%s" % str(cb.GetAvailableFormats()))
print("HasHtmlFormat()=%s" % str(cb.HasHtmlFormat()))
if cb.HasHtmlFormat():
cb.GetFromClipboard()
print("prefix=>>>%s<<<END" % cb.prefix)
print("htmlClipboardVersion=>>>%s<<<END" % cb.htmlClipboardVersion)
print("GetSelection()=>>>%s<<<END" % cb.GetSelection())
print("GetFragment()=>>>%s<<<END" % cb.GetFragment())
print("GetHtml()=>>>%s<<<END" % cb.GetHtml())
print("GetSource()=>>>%s<<<END" % cb.GetSource())
if __name__ == '__main__':
data = "<p>Writing to the clipboard is <strong>easy</strong> with this code.</p>"
PutHtml(data)
if GetHtml() == data:
print("passed")
print( GetHtml() )
else:
print("failed")
# DumpHtml()
From the question: python 3.6 windows: retrieving the clipboard CF_HTML format - Stack Overflow
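With the class above in place, putting bold text on the clipboard becomes a one-liner (a minimal sketch, assuming the HtmlClipboard code above is pasted into, or imported by, your script). Note that it sets only the "HTML Format" clipboard data, so rich-text targets such as Word or a browser editor will paste bold text, while purely plain-text targets will see nothing unless you also set a plain-text format.

# Minimal sketch: pasting into a rich-text editor shows a bold "Hello World".
PutHtml("<b>Hello World</b>")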
I have this code, which scrapes the Hacker News website with beautifulsoup4, and I am looking for a way to save the results into a DataFrame using pandas. I have already imported pandas in the code below, but I do not know how to save the results into a DataFrame. It currently scrapes only the top Hacker News posts, but that can be changed.
import pandas as pd
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from math import ceil
import json, sys, argparse, validators
MAX_NUM_POSTS = 100
class HackerNewsScraper:
URL = 'https://news.ycombinator.com/news'
def __init__(self, posts):
self._total_posts = posts
self._total_pages = int(ceil(posts/30))
self._stories = []
def scrape_stories(self):
"""
Fetches all HTML data.
Each page is limited to 30 stories, this function will ensure enough pages are fetched.
"""
page = 1
while(page <= self._total_pages): # Makes sure to visit sufficient amount of pages
url = '{}?p={}'.format(self.URL, page)
html = get_html(url)
self.parse_stories(html)
page += 1
def parse_stories(self, html):
"""
Given a BeautifulSoup nested data structure, html. parse_stories(html) will parse the data and select the desired fields.
After getting title, uri, author, comments, points, and rank, it will save them in dictionary form in self._stories.
"""
for storytext, subtext in zip(html.find_all('tr', {'class': 'athing'}),
html.find_all('td', {'class': 'subtext'})):
storylink = storytext.find_all('a',{'class':'storylink'})
sublink = subtext.select('a')
# All requested data being saved in the dictionary story below
TITLE = storylink[0].text.strip()
LINK = storylink[0]['href']
AUTHOR = sublink[0].text
COMMENTS = sublink[-1].text
POINTS = subtext.select('span')[0].text
RANK = storytext.select('span.rank')[0].text.strip('.')
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
# Make sure data satisfies requirements
story = validate_story(story)
# self._stories is an array of dictionaries that saves the requested number of stories
self._stories.append(story)
# If required number of stories met, stop parsing
if len(self._stories) >= self._total_posts:
return
def print_stories(self):
"""
Outputs the stories from list of dictionary format to JSON in STDOUT.
"""
json.dump(self._stories, sys.stdout, indent=4)
def get_stories(self):
"""
Returns the scraped stories to the user in a list of dictionary format.
Used for testing purposes.
"""
return self._stories
def get_html(url):
"""
Runs the HTML data through BeautifulSoup to get a BeautifulSoup object, a nested data structure.
"""
response = get_response(url)
if response is not None:
html = BeautifulSoup(response, 'html.parser')
return html
def validate_story(story):
"""
Ensures that all the story data is valid according to the task.
Will return valid data for each field.
"""
story['title'] = story['title'][:256]
if not valid_title(story['title']):
story['title'] = 'Valid title not found'
story['author'] = story['author'][:256]
if not valid_author(story['author']):
story['author'] = 'Valid author not found'
if not valid_uri(story['uri']):
story['uri'] = 'Valid URI not found'
story['comments'] = validate_number(story['comments'])
story['points'] = validate_number(story['points'])
story['rank'] = validate_number(story['rank'])
return story
def valid_title(title):
"""
Ensures that title is non empty string with <= 256 characters
"""
return (len(title) <= 256 and title)
def valid_author(author):
"""
Ensures that author is non empty string and <= 256 characters.
Solved the issue of not finding an author by checking the fetched data with HN username rules.
"""
    if(author.find(' ') > -1): # Hacker News usernames don't support whitespace
return False
return (len(author) <= 256 and author)
def valid_uri(url):
"""
To be able to find the scraped stories, we need their URL.
If data is not a valid URL, return False.
"""
if(validators.url(url)):
return True
return False
def validate_number(numString):
"""
Will make sure that the returned number is an integer.
Will strip any non digits from the input and return the first number.
"""
if numString.find('ago') > -1: #If not found, 'time since posted' would replace points for example
return 0
digits = [int(s) for s in numString.split() if s.isdigit()]
if len(digits) > 0:
return digits[0]
return 0
def get_response(url):
"""
Attempts to get the content at 'url' by making an HTTP GET request.
If the content-type of response is some kind of HTML/XML, return the
text content, otherwise return None.
"""
try:
with closing(get(url, stream=True)) as resp:
if is_good_response(resp):
return resp.content
else:
return None
except RequestException as e:
log_error('Error during requests to {0} : {1}'.format(url, str(e)))
return None
def is_good_response(resp):
"""
Returns True if the response seems to be HTML, False otherwise.
"""
content_type = resp.headers['Content-Type'].lower()
return (resp.status_code == 200
and content_type is not None
and content_type.find('html') > -1)
def log_error(e):
"""
Log the errors. Currently just printing them out to user.
"""
print(e)
def validate_input(arg, arg_max):
"""
Validate the user input. Makes sure it is less than or equal to 100 posts.
"""
error_msg = 'Posts cannot exceed {}'.format(arg_max)
if arg > arg_max:
raise argparse.ArgumentTypeError(error_msg)
# Parses the number of posts input from user. Default is 10.
def parse_arguments():
"""
Parses the argument input from the user. Default is 10.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--posts', '-p', metavar='n', type=int, default=1, help='number of posts (max 100)')
args = parser.parse_args()
validate_input(args.posts, MAX_NUM_POSTS)
return args.posts
def main():
"""
If user input is valid, will create a scraper and fetch requests number of posts and print them to the user.
"""
try:
posts = parse_arguments()
hnews_scraper = HackerNewsScraper(posts)
hnews_scraper.scrape_stories()
hnews_scraper.print_stories()
except argparse.ArgumentTypeError as ex:
log_error(ex)
if __name__ == '__main__':
main()
Try This:
Don't forget to import Pandas
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
Each story dict is appended to self._stories, so you can build the DataFrame directly from that list (for example via get_stories()):
dt = pd.DataFrame(hnews_scraper.get_stories(),
                  columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
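For context, here is a sketch of how this could be wired into the existing main() from the question (the CSV filename is just an illustrative choice):

def main():
    posts = parse_arguments()
    hnews_scraper = HackerNewsScraper(posts)
    hnews_scraper.scrape_stories()
    # Build the DataFrame from the list of story dicts collected by the scraper.
    dt = pd.DataFrame(hnews_scraper.get_stories(),
                      columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
    print(dt)
    dt.to_csv('hn_stories.csv', index=False)  # optional: persist the results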
I am trying to crawl this link by sending JSON requests. My first request would be:
parameters1 = {'ticker':'XOM', 'countryCode':'US',
'dateTime':'', 'docId':'1222737422 ',
'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
'messageNumber':'','count':'10',
'channelName':'/news/latest/company/us/xom', 'topic':'',
'_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
html1 = requests.get(firstUrl, params = parameters1, headers = header)
html_json1=(json.loads(html1.text))
For sending the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters. I don't know how to do that. Do you have any idea how to get the new HTML file after sending JSON requests?
import requests
import json
from bs4 import BeautifulSoup
def main():
html_url = 'http://www.marketwatch.com/investing/stock/xom'
resp = requests.get(html_url)
if resp.status_code != 200:
raise Exception("http request failed: %s" % resp)
soup = BeautifulSoup(resp.text, 'lxml')
# get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
unique_id = li_node['data-uniqueid']
print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))
baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
parameters = {
'ticker':'XOM',
'countryCode':'US',
'docType':'806',
'docId': '', # (Optional) initial value extract from HTML page
'sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2', # initial value extract from HTML page
'messageNumber':'8589', # initial value extract from HTML page
'count':'10',
'channelName': '/news/latest/company/us/xom',
}
parameters.update(extract_page_params(unique_id))
while True:
resp = requests.get(baseUrl, params = parameters)
data = json.loads(resp.text) # array of size 10
first = data[0] # get first item of array
last = data[-1] # get last item of array
print("\ngot %d data, url: %s" % (len(data), resp.url))
print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
print("")
uid = last['UniqueId'] # get value of UniqueId from dict object `last`
parameters.update(extract_page_params(uid))
input("press <enter> to get next")
def extract_page_params(uid):
sequence = ''
messageNumber = ''
docId = ''
if ':' in uid: # if the symbol ':' in string `uid`
# uid looks like `e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499`
# so split it by ':'
sequence, messageNumber = uid.split(':')
else:
docId = uid
return {
'sequence': sequence,
'messageNumber': messageNumber,
'docId': docId,
}
if __name__ == '__main__':
main()
This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can copy it directly and run it with Python 3 (Python 2 should also work).
You can use Beautiful Soup to extract data from HTML. It is a Python library for extracting data from HTML documents.
I got "ERROR: Exception during output: 'instancemethod' object is not iterable" when debugging this AirPi code from https://github.com/haydnw/AirPi/blob/master/outputs/ubidots.py
This is supposed to upload my sensor data to the Ubidots server.
I have put my correct token and variable ID inside the configuration file for this AirPi.
requiredSpecificParams = ["token"]
optionalSpecificParams = ["showcost",
"ID-BMP085-temp",
"ID-BMP085-pres",
"ID-DHT22-hum",
"ID-DHT22-temp",
"ID-LDR",
"ID-TGS2600",
"ID-MiCS-2710",
"ID-MiCS-5525",
"ID-Microphone",
"ID-Raingauge"
]
def __init__(self, config):
super(Ubidots, self).__init__(config)
self.token = self.params["token"]
if "showcost" in self.params:
self.showcost = self.params["showcost"]
else:
self.showcost = False
self.ubivariables = {}
for key, value in self.params.iteritems():
if key[:3] == "ID-":
if value:
self.ubivariables[key[3:]] = value
def output_data(self, datapoints, dummy):
"""Output data.
Output data in the format stipulated by the plugin. Calibration
is carried out first if required.
Because this particular plugin (ubidots) does not show time, the
third argument (normally called 'sampletime') is called 'dummy'
to facilitate compliance with pylint.
Args:
self: self.
datapoints: A dict containing the data to be output.
dummy: datetime representing the time the sample was taken.
Returns:
boolean True if data successfully output to Ubidots; False if
not
"""
if self.params["calibration"]:
datapoints = self.cal.calibrate(datapoints)
payload = []
for point in datapoints:
for ubivariablename, ubivariableid in self.ubivariables.iteritems():
if point["sensor"] == ubivariablename:
if point["value"] is not None:
thisvalue = {}
thisvalue["variable"] = ubivariableid
thisvalue["value"] = point["value"]
payload.append(thisvalue)
break
headers = {'Accept': 'application/json; indent=4', 'Content-Type': 'application/json', 'X-Auth-Token': self.token}
url = "http://things.ubidots.com/api/v1.6/collections/values"
req = None
cost = 0
try:
req = requests.post(url, data=json.dumps(payload), headers=headers)
except Exception, e:
print("ERROR: Failed to contact the Ubidots service.")
print("ERROR: " + str(e))
return False
for response in req.json:
if response["status_code"] is not 201:
print("ERROR: Ubidots responded with an error for one of the values.")
return False
else:
cost += 1
if self.showcost:
print("Ubidots upload cost " + str(cost) + " dots.")
return True
for response in req.json:
According to the documentation, json is a method and must be called, so this should be:
for response in req.json():
In the future it is helpful to include just as much of your code as is necessary to reproduce the problem, and to include the complete error message with traceback.
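As a side note, the status check inside that loop has a related problem: "is not 201" tests object identity rather than equality. A corrected sketch of just that fragment (not the whole plugin) would be:

for response in req.json():
    if response["status_code"] != 201:
        print("ERROR: Ubidots responded with an error for one of the values.")
        return False
    else:
        cost += 1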
I'm attempting to implement dynamic routing for a web framework. At the moment, the goal is to pass arguments into a function by way of the URL. So, if a user offers a URL of "/page/23", the route function will extract the "23", which will then be used as a parameter for the page function. I am getting a KeyError, however.
import re
routing_table = {}
url = "/page/23"
def route(url, func):
key = url
key = re.findall(r"(.+?)/<[a-zA-Z_][a-zA-Z0-9_]*>", url)
if key:
params = re.findall(r"<([a-zA-Z_][a-zA-Z0-9_]*)>", url)
routing_table[key[0]] = [params, func]
else:
routing_table[url] = func
def find_path(url):
if url in routing_table:
return routing_table[url]
else:
return None
def page(page_id):
return "this is page %d" % page_id
route("/page/<page_id>", page)
print(routing_table[url])
When you called route, you passed a url equal to "/page/<page_id>", and the regex in route stores the captured prefix ("/page") as the key. In the last line, however, url is the global variable "/page/23", which is not one of the keys in routing_table, hence the KeyError.
It looks like there are other problems: replace your last line with
print(routing_table)
to see what you're doing.
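To go a step further than printing the table, here is a minimal sketch (my own illustration, not the poster's intended design) of matching a concrete URL such as "/page/23" against the stored pattern and passing the captured value to the handler:

import re

routing_table = {}

def route(url, func):
    # Record the parameter names and turn the pattern into a regex,
    # e.g. "/page/<page_id>" -> "^/page/([^/]+)$"
    params = re.findall(r"<([a-zA-Z_][a-zA-Z0-9_]*)>", url)
    pattern = "^" + re.sub(r"<[a-zA-Z_][a-zA-Z0-9_]*>", "([^/]+)", url) + "$"
    routing_table[pattern] = (params, func)

def find_path(url):
    # Match the concrete URL against every stored pattern.
    for pattern, (params, func) in routing_table.items():
        match = re.match(pattern, url)
        if match:
            kwargs = dict(zip(params, match.groups()))
            return func(**kwargs)
    return None

def page(page_id):
    return "this is page %s" % page_id

route("/page/<page_id>", page)
print(find_path("/page/23"))  # -> this is page 23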