how to get the text in one of the divs? (html) - python

I am writing my bot, which so far has to get the text from the div from one page and put it in a variable, but this does not work out and the variable always remains empty. How i can extract it?
import telebot;
import requests
from lxml import etree
import lxml.html
import csv
bot = telebot.TeleBot('');
#bot.message_handler(content_types=['text'])
def get_text_messages(message):
api = requests.get("https://slovardalja.net/word.php?wordid=21880")
tree = lxml.html.document_fromstring(api.text)
text_original = tree.xpath('/html/body/table/tbody/tr[2]/td/table/tbody/tr/td[2]/index/div[2]/p[1]/strong/text()')
print(text_original)
bot.send_message(message.chat.id,str(text_original))
bot.polling(none_stop=True, interval=0)
https://slovardalja.net/word.php?wordid=21880
I think this code should get the word "ОЛЕКВАС", I copied the path to it and added /text(), but it doesn't work

I have no cyrillic on my system, but with a smaller xpath value and the usage from text_content it print something on shell, hopefully it helps
api = requests.get("https://slovardalja.net/word.php?wordid=21880")
tree = lxml.html.document_fromstring(api.text)
text_original = tree.xpath('//div[#align="justify"]/p/strong')
print(text_original[0].text_content())

Related

Problem with lxml.xpath not putting elements into a list

So here's my problem. I'm trying to use lxml to web scrape a website and get some information but the elements that the information pertains to aren't being found when using the var.xpath command. It's finding the page but after using the xpath it doesn't find anything.
import requests
from lxml import html
def main():
result = requests.get('https://rocketleague.tracker.network/rocket-league/profile/xbl/ReedyOrange/overview')
# the root of the tracker website
page = html.fromstring(result.content)
print('its getting the element from here', page)
threesRank = page.xpath('//*[#id="app"]/div[2]/div[2]/div/main/div[2]/div[3]/div[1]/div/div/div[1]/div[2]/table/tbody/tr[*]/td[3]/div/div[2]/div[1]/div')
print('the 3s rank is: ', threesRank)
if __name__ == "__main__":
main()
OUTPUT:
"D:\Python projects\venv\Scripts\python.exe" "D:/Python projects/main.py"
its getting the element from here <Element html at 0x20eb01006d0>
the 3s rank is: []
Process finished with exit code 0
The output next to "the 3s rank is:" should look something like this
[<Element html at 0x20eb01006d0>, <Element html at 0x20eb01006d0>, <Element html at 0x20eb01006d0>]
Because the xpath string does not match, no result set is returned by page.xpath(..). It's difficult to say exactly what you are looking for but considering "threesRank" I assume you are looking for all the table values, ie. ranking and so on.
You can get a more accurate and self-explanatory xpath using the Chrome Addon "Xpath helper". Usage: enter the site and activate the extension. Hold down the shift key and hoover on the element you are interested in.
Since the HTML used by tracker.network.com is built dynamically using javascript with BootstrapVue (and Moment/Typeahead/jQuery) there is a big risk the dynamic rendering is producing different results from time to time.
Instead of scraping the rendered html, I suggest you instead use the structured data needed for the rendering, which in this case is stored as json in a JavaScript variable called __INITIAL_STATE__
import requests
import re
import json
from contextlib import suppress
# get page
result = requests.get('https://rocketleague.tracker.network/rocket-league/profile/xbl/ReedyOrange/overview')
# Extract everything needed to render the current page. Data is stored as Json in the
# JavaScript variable: window.__INITIAL_STATE__={"route":{"path":"\u0 ... }};
json_string = re.search(r"window.__INITIAL_STATE__\s?=\s?(\{.*?\});", result.text).group(1)
# convert text string to structured json data
rocketleague = json.loads(json_string)
# Save structured json data to a text file that helps you orient yourself and pick
# the parts you are interested in.
with open('rocketleague_json_data.txt', 'w') as outfile:
outfile.write(json.dumps(rocketleague, indent=4, sort_keys=True))
# Access members using names
print(rocketleague['titles']['currentTitle']['platforms'][0]['name'])
# To avoid 'KeyError' when a key is missing or index is out of range, use "with suppress"
# as in the example below: since there there is no platform no 99, the variable "platform99"
# will be unassigned without throwing a 'keyerror' exception.
from contextlib import suppress
with suppress(KeyError):
platform1 = rocketleague['titles']['currentTitle']['platforms'][0]['name']
platform99 = rocketleague['titles']['currentTitle']['platforms'][99]['name']
# print platforms used by currentTitle
for platform in rocketleague['titles']['currentTitle']['platforms']:
print(platform['name'])
# print all titles with corresponding platforms
for title in rocketleague['titles']['titles']:
print(f"\nTitle: {title['name']}")
for platform in title['platforms']:
print(f"\tPlatform: {platform['name']}")
lxml doesn't support "tbody". change your xpath to
'//*[#id="app"]/div[2]/div[2]/div/main/div[2]/div[3]/div[1]/div/div/div[1]/div[2]/table/tr[*]/td[3]/div/div[2]/div[1]/div'

Reading specific content from website and display using python

The code below is what i have currently done, but i am struggling to get it working properly...
hope you can help :)
#A python programme which shows the current price of bitcoin.
#(a well-known crypto-currency.)
import urllib
import urllib2
def webConnect():
aResp = urllib2.urlopen("https://www.cryptocompare.com/coins/btc/overview/GBP")
web_pg = aResp.read();
print web_pg
def main():
webConnect()
main()
g = Grab()
g.go(address)
btc_div = g.xpath('//*/div[class="ng-binding"]')
val = btc_div.xpath(u"dl/dt[contains(text(),'%s')]/../dd/text()" % 'if only that tag contains this text')
print val[0]
One option is to use beautifulsoup library.
This question has example of finding tags by text : BeautifulSoup - search by text inside a tag
Tutorial : https://www.dataquest.io/blog/web-scraping-tutorial-python/

Python - How to test similarity between strings and only print new strings?

I have developed a webscraper with beautiful soup that scrapes news from a website and then sends them to a telegram bot. Every time the program runs it picks up all the news currently on the news web page, and I want it to just pick the new entries on the news and send only those.
How can I do this? Should I use a sorting algorithm of some sort?
Here is the code:
#Lib requests
import requests
import bs4
fonte = requests.get('https://www.noticiasaominuto.com/')
soup = bs4.BeautifulSoup(fonte.text, 'lxml')
body = soup.body
for paragrafo in body.find_all('p', class_='article-thumb-text'):
print(paragrafo.text)
conteudo = paragrafo.text
id = requests.get('https://api.telegram.org/bot<TOKEN>/getUpdates')
chat_id = id.json()['result'][0]['message']['from']['id']
print(chat_id)
msg = requests.post('https://api.telegram.org/bot<TOKEN>/sendMessage', data = {'chat_id': chat_id ,'text' : conteudo})
You need to keep track of articles that you have seen before, either by using a full database solution or by simply saving the information in a file. The file needs to be read before starting. The website is then scraped and compared against the existing list. Any articles not in the list are added to the list. At the end, the updated list is saved back to the file.
Rather that storing the whole text in the file, a hash of the text can be saved instead. i.e. convert the text into a unique number, in this case a hex digest is used to make it easier to save to a text file. As each hash will be unique, they can be stored in a Python set to speed up the checking:
import hashlib
import requests
import bs4
import os
# Read in hashes of past articles
db = 'past.txt'
if os.path.exists(db):
with open(db) as f_past:
past_articles = set(f_past.read().splitlines())
else:
past_articles = set()
fonte = requests.get('https://www.noticiasaominuto.com/')
soup = bs4.BeautifulSoup(fonte.text, 'lxml')
for paragrafo in soup.body.find_all('p', class_='article-thumb-text'):
m = hashlib.md5(paragrafo.text.encode('utf-8'))
if m.hexdigest() not in past_articles:
print('New {} - {}'.format(m.hexdigest(), paragrafo.text))
past_articles.add(m.hexdigest())
# ...Update telegram here...
# Write updated hashes back to the file
with open(db, 'w') as f_past:
f_past.write('\n'.join(past_articles))
The first time this is run, all articles will be displayed. The next time, no articles will be displayed until the website is updated.

Python3 XML get text between tags

I have the following code in Python 3. I am using the import xml.etree.ElementTree as ET for XML parsing. the webScraper grab the text from an webside but on that website there is text between the <link></link> tag, but the program returns None. I can se that the program finds all tags but where the tag result should be printed it only says None.
result = webScrapper.scrappPart("http://www.dn.se/rss/senaste-nytt/", "body")
root = ET.fromstring(result)
for items in root.findall('.//item'):
link = items.find('link')
print(link.text)
Does anyone know how to fix this?
Since your URL is actually an RSS feed, you'd be much better off using an RSS feed parser on it, instead of trying to roll your own. Fortunately, this is why feedparser exists. Check this out:
import feedparser as fp
feed = fp.parse("http://www.dn.se/rss/senaste-nytt/")
for entry in feed["entries"]:
print(entry["link"])
This returns
http://www.dn.se/sport/fotboll/cavani-het-i-svalt-psg/
http://www.dn.se/sport/fotbolls-em/kompany-missar-em/
http://www.dn.se/nyheter/sverige/livvaktens-slakting-fick-praktik-hos-sahlin-trots-myndighetens-avslag/
http://www.dn.se/sport/st-louis-andraperiod-avgjorde/
http://www.dn.se/nyheter/varlden/syrien-spanska-journalister-fria/
http://www.dn.se/sport/dansk-dynamit-ska-stoppa-tre-kronor/
http://www.dn.se/nyheter/sverige/mordmisstankt-slappt-ur-haktet-1/
http://www.dn.se/nyheter/varlden/ekonomiprofessor-loste-ekvation-togs-for-terrorist/
http://www.dn.se/sport/fotboll/leicester-firade-med-storseger/
http://www.dn.se/ekonomi/protester-mot-ny-granskontroll-urartade/
http://www.dn.se/sport/ishockey-vm/jimmie-ericsson-jag-ar-beredd-gora-allt-for-att-vinna/
http://www.dn.se/sport/ishockey-vm/schweiz-straffat-av-kazakstan/
http://www.dn.se/nyheter/varlden/natosoldater-dodade-i-afghanistan-2/
http://www.dn.se/sport/forsta-matchen-till-eslov/
http://www.dn.se/nyheter/sverige/drunknad-man-hittad-av-dykare/
http://www.dn.se/ekonomi/tagstopp-efter-olycka/
http://www.dn.se/sport/kristianstad-till-sm-final/
http://www.dn.se/sthlm/en-person-attackerad-med-kniv-i-centrala-stockholm/
http://www.dn.se/nyheter/sverige/inga-spar-efter-forsvunnen-22-arig-student/
http://www.dn.se/sport/fotboll/forlust-for-rydstrom-i-tranardebuten/
http://www.dn.se/nyheter/sverige/manga-grasbrander-runt-om-i-landet/
http://www.dn.se/nyheter/sverige/tre-gripna-efter-skottlossning-i-malmo/
http://www.dn.se/sport/fotboll/elfsborg-ar-med-i-toppen-igen/
http://www.dn.se/sport/em-silver-till-rissveds/
which I assume is what you're looking for.
You can use ElementTree just fine, you just need to pass the source and use the xpath:
from xml.etree import ElementTree as et
import requests
tree = et.fromstring(requests.get("http://www.dn.se/rss/senaste-nytt/").content)
print([x.text for x in tree.findall(".//item//link")])
Output:
['http://www.dn.se/nyheter/varlden/andlig-ledare-ihjalhackad-i-bangladesh/', 'http://www.dn.se/nyheter/sverige/tillstandet-battre-for-pakord-ettaring/', 'http://www.dn.se/ekonomi/maria-crofts-dags-att-gora-nagot-at-orattvisa-pensioner/', 'http://www.dn.se/nyheter/varlden/turkisk-militar-dodade-55-is-krigare/', 'http://www.dn.se/nyheter/varlden/massiv-fiskdod-i-sjo/', 'http://www.dn.se/nyheter/varlden/kanadabranden-i-bilder/', 'http://www.dn.se/nyheter/sverige/manga-saknas-efter-jordskred-i-kina/', 'http://www.dn.se/nyheter/sverige/fortsatt-sokande-efter-student/', 'http://www.dn.se/nyheter/sverige/en-dod-i-villabrand-8/', 'http://www.dn.se/nyheter/politik/v-vill-ta-bort-terrorstampel-pa-pkk/', 'http://www.dn.se/ekonomi/raknehjalp-pa-natet-ger-ratt-underhall/', 'http://www.dn.se/nyheter/varlden/kanadabranden-fullstandigt-okontrollerad/', 'http://www.dn.se/nyheter/varlden/attentat-mot-journalister-besvarande-for-erdogan/', 'http://www.dn.se/nyheter/varlden/superlobbyist-ska-gora-trump-serios/', 'http://www.dn.se/nyheter/vetenskap/karin-bojs-en-typisk-foralder-ar-28-ar-gammal/', 'http://www.dn.se/sport/nervos-vantan-pa-em-biljetter/', 'http://www.dn.se/ekonomi/ovantat-stort-exportfall-i-kina/', 'http://www.dn.se/ekonomi/lott-gav-35-miljarder-i-vinst-i-usa/', 'http://www.dn.se/nyheter/vetenskap/fabels-kansliga-nos-ska-ge-svar-om-massmordet/', 'http://www.dn.se/sport/johan-esk-nu-borde-idrotten-lara-ledarskap-av-naringslivet-1/', 'http://www.dn.se/sport/melker-karlsson-malskytt-for-san-jose/', 'http://www.dn.se/sport/backstrom-visade-vagen-till-washingtons-viktiga-vinst/', 'http://www.dn.se/nyheter/varlden/15-miljoner-signaturer-bekraftade/', 'http://www.dn.se/nyheter/varlden/medan-du-sov-varlden-i-korthet-8-maj-1/', 'http://www.dn.se/nyheter/varlden/karnvapen-bara-om-landet-hotas/', 'http://www.dn.se/nyheter/varlden/protester-mot-att-avskaffa-senat/', 'http://www.dn.se/nyheter/sverige/industri-brann-i-uppsala/', 'http://www.dn.se/nyheter/varlden/atta-poliser-dodade-i-attack/', 'http://www.dn.se/nyheter/varlden/tva-fast-vid-myr-utanfor-kiruna/', 'http://www.dn.se/sport/hockeyhasten-nyquist-vann-kentucky-derby/', 'http://www.dn.se/nyheter/varlden/15-miljoner-signaturer-bekraftade-i-venezuela/']
Or using lxml which can also get the source for you:
from lxml import etree
result = etree.parse("http://www.dn.se/rss/senaste-nytt/")
print(result.xpath("//item//link//text()"))
Which gives you the exact same output.

Trying to parse xml from url store to string so I can use in another spot to output to irc

The following is the xml from remote URL
<SHOUTCASTSERVER>
<CURRENTLISTENERS>0</CURRENTLISTENERS>
<PEAKLISTENERS>0</PEAKLISTENERS>
<MAXLISTENERS>100</MAXLISTENERS>
<UNIQUELISTENERS>0</UNIQUELISTENERS>
<AVERAGETIME>0</AVERAGETIME>
<SERVERGENRE>variety</SERVERGENRE>
<SERVERGENRE2/>
<SERVERGENRE3/>
<SERVERGENRE4/>
<SERVERGENRE5/>
<SERVERURL>http://localhost/</SERVERURL>
<SERVERTITLE>Wicked Radio WIKD/WPOS</SERVERTITLE>
<SONGTITLE>Unknown - Haxor Radio Show 08</SONGTITLE>
<STREAMHITS>0</STREAMHITS>
<STREAMSTATUS>1</STREAMSTATUS>
<BACKUPSTATUS>0</BACKUPSTATUS>
<STREAMLISTED>0</STREAMLISTED>
<STREAMLISTEDERROR>200</STREAMLISTEDERROR>
<STREAMPATH>/stream</STREAMPATH>
<STREAMUPTIME>448632</STREAMUPTIME>
<BITRATE>128</BITRATE>
<CONTENT>audio/mpeg</CONTENT>
<VERSION>2.4.7.256 (posix(linux x64))</VERSION>
</SHOUTCASTSERVER>
All I am trying to do is store the contents of the element <SONGTITLE> store it so I can post to IRC using a bot that I have.
import urllib2
from lxml import etree
url = "http://142.4.217.133:9203/stats?sid=1&mode=viewxml&page=0"
fp = urllib2.urlopen(url)
doc = etree.parse(fp)
fp.close()
for record in doc.xpath('//SONGTITLE'):
for x in record.xpath("./subfield/text()"):
print "\t", x
That is what I have so far; not sure what I am doing wrong here. I am quite new to python but the IRC bot works and does some other utility type things I just want to add this as a feature to it.
You don't need to include ./subfield/:
for x in record.xpath("text()"):
Output:
Unknown - Haxor Radio Show 08

Categories