How to download all transcripts from seeking alpha - python

Is there some way to automatically download all the transcripts from the SA website?
http://seekingalpha.com/earnings/earnings-call-transcripts
I tried using the http://newspaper.readthedocs.io/en/latest/ python code but I get the following error:
earnings_call_transcripts_2 = newspaper.build('http://seekingalpha.com/earnings/earnings-call-transcripts', memoize_articles=False)
Traceback (most recent call last):
File "/Users/name/anaconda/lib/python3.5/site-packages/newspaper/parsers.py", line 67, in fromstring
cls.doc = lxml.html.fromstring(html)
File "/Users/name/anaconda/lib/python3.5/site-packages/lxml/html/__init__.py", line 867, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Users/name/anaconda/lib/python3.5/site-packages/lxml/html/__init__.py", line 752, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "src/lxml/lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src/lxml/lxml.etree.c:77697)
File "src/lxml/parser.pxi", line 1819, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:116494)
File "src/lxml/parser.pxi", line 1700, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:115040)
File "src/lxml/parser.pxi", line 1040, in lxml.etree._BaseParser._parseUnicodeDoc (src/lxml/lxml.etree.c:109165)
File "src/lxml/parser.pxi", line 573, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:103404)
File "src/lxml/parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:105058)
File "src/lxml/parser.pxi", line 622, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:104143)
File "<string>", line None
lxml.etree.XMLSyntaxError: line 295: b"htmlParseEntityRef: expecting ';'"
[Source parse ERR] http://seekingalpha.com/earnings/earnings-call-transcripts

Related

lxml.etree.XMLSyntaxError while trying to read_excel using pandas

I've a spreadsheet (~50 mb) with multiple sheets, and I'm trying to read it using pandas.
import pandas as pd
df = pd.read_excel('compiled_output.xlsx', sheet_name='Sheet1')
I'm not sure why it's throwing lxml.etree.XMLSyntaxError; I've done this many times before. I also tried passing engine=openpyxl and downgrading to pandas==1.2.4, but I get the same error:
df = pd.read_excel('compiled_output.xlsx', sheet_name='Sheet1')
File "/usr/local/lib/python3.9/site-packages/pandas/util/_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 336, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 1131, in __init__
self._reader = self._engines[engine](self._io, storage_options=storage_options)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py", line 475, in __init__
super().__init__(filepath_or_buffer, storage_options=storage_options)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 391, in __init__
self.book = self.load_workbook(self.handles.handle)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py", line 486, in load_workbook
return load_workbook(
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 317, in load_workbook
reader.read()
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 282, in read
self.read_worksheets()
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 216, in read_worksheets
rels = get_dependents(self.archive, rels_path)
File "/usr/local/lib/python3.9/site-packages/openpyxl/packaging/relationship.py", line 131, in get_dependents
node = fromstring(src)
File "src/lxml/etree.pyx", line 3237, in lxml.etree.fromstring
File "src/lxml/parser.pxi", line 1896, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1784, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1141, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError
File "<string>", line 2
lxml.etree.XMLSyntaxError: internal error: Huge input lookup, line 2, column 12753697

Can etree.XMLParser in recover mode still throw a parse error?

I have a utility method that parses XML using a parser created as etree.XMLParser(recover=True). I would like to test failure scenarios in a unit test. Except for empty input throwing an lxml.etree.XMLSyntaxError, I can't seem to break the parser.
My question is: is it possible to construct a StringIO or BytesIO input for this parser such that the parser throws a parse error?
Here are some examples (tested with Python 3.5 and lxml 4.3.3):
from io import BytesIO
from lxml import etree
def parse(xml):
parser = etree.XMLParser(recover=True)
elem = etree.parse(BytesIO(xml), parser)
print(etree.tostring(elem))
parse(b'<broken<') # prints b'<broken/>'
parse(b'</lf|\jf>') # prints None
parse('<?xml encoding="ascii"?><foo>æøå</foo>'.encode('utf-8')) # prints b'<foo/>'
parse(b'') # Throws lxml.etree.XMLSyntaxError
If I slap a NULL character at the beginning of any of the bad inputs you show that don't raise an error, I do get an error. For instance:
parse(b'\0<broken<')
produces:
Traceback (most recent call last):
File "test.py", line 13, in <module>
parse(b'\0<broken<') # prints b'<broken/>'
File "test.py", line 9, in parse
elem = etree.parse(BytesIO(xml), parser)
File "src/lxml/etree.pyx", line 3435, in lxml.etree.parse
File "src/lxml/parser.pxi", line 1857, in lxml.etree._parseDocument
File "src/lxml/parser.pxi", line 1877, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1765, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 1
lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1
Isn't it because you are using recover=True?
recover - try hard to parse through broken XML
I changed recover=False and I get:
Traceback (most recent call last):
File "./foo.py", line 11, in <module>
parse(b'<broken<') # prints b'<broken/>'
File "./foo.py", line 7, in parse
elem = etree.parse(BytesIO(xml), parser)
File "src/lxml/etree.pyx", line 3435, in lxml.etree.parse
File "src/lxml/parser.pxi", line 1857, in lxml.etree._parseDocument
File "src/lxml/parser.pxi", line 1877, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1765, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 1
lxml.etree.XMLSyntaxError: error parsing attribute name, line 1, column 8
Am I missing something?

Reading Odoo Python Traceback

I am working on an Odoo website. I am not very familiar with Odoo or Python, and while I was testing something, the Products page on the backend started showing an XML syntax error. I made changes through the backend's debug mode, but I made sure to undo those changes before proceeding. Since this still might be the cause of the problem, I'm trying to figure out the file it was modifying.
Traceback (most recent call last):
File "/opt/odoo/odoo/http.py", line 638, in _handle_exception
return super(JsonRequest, self)._handle_exception(exception)
File "/opt/odoo/odoo/http.py", line 675, in dispatch
result = self._call_function(**self.params)
File "/opt/odoo/odoo/http.py", line 331, in _call_function
return checked_call(self.db, *args, **kwargs)
File "/opt/odoo/odoo/service/model.py", line 119, in wrapper
return f(dbname, *args, **kwargs)
File "/opt/odoo/odoo/http.py", line 324, in checked_call
result = self.endpoint(*a, **kw)
File "/opt/odoo/odoo/http.py", line 933, in __call__
return self.method(*args, **kw)
File "/opt/odoo/odoo/http.py", line 504, in response_wrap
response = f(*args, **kw)
File "/opt/odoo/addons/web/controllers/main.py", line 862, in call_kw
return self._call_kw(model, method, args, kwargs)
File "/opt/odoo/addons/web/controllers/main.py", line 854, in _call_kw
return call_kw(request.env[model], method, args, kwargs)
File "/opt/odoo/odoo/api.py", line 679, in call_kw
return call_kw_model(method, model, args, kwargs)
File "/opt/odoo/odoo/api.py", line 664, in call_kw_model
result = method(recs, *args, **kwargs)
File "/opt/odoo/odoo/models.py", line 1324, in load_views
for [v_id, v_type] in views
File "/opt/odoo/odoo/models.py", line 1324, in <dictcomp>
for [v_id, v_type] in views
File "/opt/odoo/addons/mail/models/mail_thread.py", line 363, in fields_view_get
res = super(MailThread, self).fields_view_get(view_id=view_id, view_type=view_type, toolbar=toolbar, submenu=submenu)
File "/opt/odoo/odoo/models.py", line 1383, in fields_view_get
root_view = View.browse(view_id).read_combined(['id', 'name', 'field_parent', 'type', 'model', 'arch'])
File "/opt/odoo/odoo/addons/base/ir/ir_ui_view.py", line 638, in read_combined
view_arch = etree.fromstring(view_data['arch'].encode('utf-8'))
File "lxml.etree.pyx", line 3032, in lxml.etree.fromstring (src/lxml/lxml.etree.c:68106)
File "parser.pxi", line 1785, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:102455)
File "parser.pxi", line 1673, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:101284)
File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:96466)
File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:91275)
File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:92461)
File "parser.pxi", line 622, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:91757)
XMLSyntaxError: Blank needed here, line 1, column 19
The error trace shows that there is some syntactical error in an xml file.
After correcting the syntax you will have to upgrade the module.
You can either upgrade the module from the 'App' menu or use the '-u' option followed by the name of the module when starting the server.
Thanks

Empty SVG file error while it's not

Working with Pycairo, I create an "SVGSurface" (which creates an SVG file for me) and I draw on it using a "context". After I finish, I need to use the SVG file, but it seems that the file has not been closed, so I get an error saying that the document is empty.
ps = cairo.SVGSurface("header.svg", width, height)
cr = cairo.Context(ps)
drawRectangle (cr,
papersize.convert_length(int(lg[0]), "px","pt"),
papersize.convert_length(int(lg[2]), "px","pt"),
papersize.convert_length(int(lg[1])-int(lg[0])-1, "px","pt"),
papersize.convert_length(int(lg[3])-int(lg[2])-1, "px","pt"),
0, 0, 0.5
)
cr.show_page()
head = st.fromfile("header.svg")
It gives me this error :
File "/usr/local/lib/python2.7/dist-packages/svgutils/transform.py", line 249, in fromfile
svg_file = etree.parse(fid)
File "src/lxml/lxml.etree.pyx", line 3427, in lxml.etree.parse (src/lxml/lxml.etree.c:81117)
File "src/lxml/parser.pxi", line 1832, in lxml.etree._parseDocument (src/lxml/lxml.etree.c:118116)
File "src/lxml/parser.pxi", line 1852, in lxml.etree._parseFilelikeDocument (src/lxml/lxml.etree.c:118399)
File "src/lxml/parser.pxi", line 1747, in lxml.etree._parseDocFromFilelike (src/lxml/lxml.etree.c:117187)
File "src/lxml/parser.pxi", line 1162, in lxml.etree._BaseParser._parseDocFromFilelike (src/lxml/lxml.etree.c:111914)
File "src/lxml/parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:105109)
File "src/lxml/parser.pxi", line 706, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:106817)
File "src/lxml/parser.pxi", line 635, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:105671)
lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1 (line 1)
I tried to close the file with os, but it didn't work.

Python Crawler - html.fromstring

I am trying to parse a web page with this code.
ac = requests.get('link....')
html_text = ac.text
lx = html.fromstring(html_text)
When I run this code, I get this error:
Traceback (most recent call last):
File "Crawler.py", line 197, in <module>
cnx.close()
File "Crawler.py", line 46, in RequestPage
lx = html.fromstring(html_text)
File "C:\Python27\lib\site-packages\lxml\html\__init__.py", line 867, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "C:\Python27\lib\site-packages\lxml\html\__init__.py", line 752, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "src\lxml\lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src\lxml\lxml.etree.c:76696)
File "src\lxml\parser.pxi", line 1830, in lxml.etree._parseMemoryDocument (src\lxml\lxml.etree.c:115101)
File "src\lxml\parser.pxi", line 1711, in lxml.etree._parseDoc (src\lxml\lxml.etree.c:113677)
File "src\lxml\parser.pxi", line 1051, in lxml.etree._BaseParser._parseUnicodeDoc (src\lxml\lxml.etree.c:107847)
File "src\lxml\parser.pxi", line 584, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:102150)
File "src\lxml\parser.pxi", line 694, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:103800)
File "src\lxml\parser.pxi", line 633, in lxml.etree._raiseParseError (src\lxml\lxml.etree.c:102888)
lxml.etree.XMLSyntaxError: line 1843: Tag ie:menuitem invalid
I found the HTML tag that causes the error:
<ie:menuitem id="MSOMenu_Help" iconsrc="/_layouts/images/HelpIcon.gif" onmenuclick="MSOWebPartPage_SetNewWindowLocation(MenuWebPart.getAttribute('helpLink'), MenuWebPart.getAttribute('helpMode'))" text="Help" type="option" style="display:none">
</ie:menuitem>
You found the HTML tag that causes the error, but did you fix it? If not, try this:
ac = requests.get('link....')
lx = html.fromstring(ac.content)
valueOfHTMLTag = lx.xpath('//TAG[#class/id="Name"]/text()')
where you replace:
TAG with the name of the tag whose value you want to get,
class/id with either class or id, whichever attribute identifies the tag,
Name with the value of that class or id attribute.
This will return an array with the values of that tag with the correct class/id.
Hope this helps!

Categories