The following Regex gives me this output (note that I am using Python):
Which is perfect and exactly how I want it to be. However, when I run this regex in Python it works, but it doesn't capture the next line of vlans when I use groupdict (I'm talking about the second entry):
{'port_name': 'Te1/0/1', 'description': 'CVH10 Mgt+Clstr', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),161-163'}
{'port_name': 'Te1/0/2', 'description': 'CVH10 VM 1', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),11,101,110,'}
{'port_name': 'Fo2/1/1', 'description': None, 'duplex': 'N/A', 'speed': 'N/A', 'neg': 'N/A', 'link_state': 'Detach', 'flow_control': 'N/A', 'mode': None, 'vlans': None}
{'port_name': 'Te2/0/8', 'description': None, 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Down', 'flow_control': 'Off', 'mode': ' A', 'vlans': '1'}
As you can see in the regex above, the second entry matches 19 vlans, but the Python output only gives me 4. How can I fix this?
This is the code that I'm running:
from sys import argv
import re
import pprint

pp = pprint.PrettyPrinter()
script, filename = argv

interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?')

local_list = []

def main():
    with open(filename) as current_file:
        for linenumber, line in enumerate(current_file, 1):
            working_dict = {}
            interface_details_result = interface_details.match(line)
            if interface_details_result is not None:
                working_dict.update(interface_details_result.groupdict())
                local_list.append(working_dict)
    for each in local_list:
        print(each)

if __name__ == '__main__':
    main()
Note that I'm using argv, so it's run as: python3 main.py test.txt
The data of the text file is listed below:
>show interfaces status
Port      Description     Duplex Speed   Neg  Link   Flow  M  VLAN
                                              State  Ctrl
--------- --------------- ------ ------- ---- ------ ----- -- -------------------
Te1/0/1   CVH10 Mgt+Clstr Full   10000   Off  Up     On    T  (1),161-163
Te1/0/2   CVH10 VM 1      Full   10000   Off  Up     On    T  (1),11,101,110,
                                                              120,130,140,150,
                                                              160,170,180,190,
                                                              200,210,230,240,
                                                              250,666,999
Fo2/1/1                   N/A    N/A     N/A  Detach N/A
Te2/0/8                   Full   10000   Off  Down   Off   A  1
Currently you are reading separate lines, so the pattern will not match the lines that contain only a VLAN continuation like this:
120,130,140,150,
What you could do is read the whole file at once using current_file.read() and add re.M so that ^ can match at the start of every line.
In your code you are using the following, which will first update the dict and then append working_dict, resulting in n entries of the same (last) value when they all point to the same dict:
working_dict.update(interface_details_result.groupdict())
local_list.append(working_dict)
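A tiny standalone sketch of that aliasing pitfall (hypothetical names, not the question's code):

shared = {}
result = []
for value in ('a', 'b', 'c'):
    shared.update(x=value)  # mutates the one shared dict
    result.append(shared)   # appends a reference, not a copy
print(result)  # [{'x': 'c'}, {'x': 'c'}, {'x': 'c'}]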
If you want to gather all the groupdicts in a list, you can append each match's dict using local_list.append(m.groupdict())
import re
import pprint
from sys import argv

pp = pprint.PrettyPrinter()
script, filename = argv  # as in the question, so `filename` is defined

interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?', re.M)

def main():
    local_list = []
    with open(filename) as current_file:
        all_lines = re.finditer(interface_details, current_file.read())
        for m in all_lines:
            local_list.append(m.groupdict())
    for each in local_list:
        print(each)

if __name__ == '__main__':
    main()
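With the whole file read at once and re.M set, the vlans group of the second entry should now also capture the indented continuation lines, so all 19 vlans end up in the match.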
You are matching line by line, but the entry spans several lines:

Te1/0/2   CVH10 VM 1      Full   10000   Off  Up     On    T  (1),11,101,110,
                                                              120,130,140,150,
                                                              160,170,180,190,
                                                              200,210,230,240,
                                                              250,666,999

The first line, which is

Te1/0/2   CVH10 VM 1      Full   10000   Off  Up     On    T  (1),11,101,110,

passes your regex expression. But the following lines don't. For example, the second line is

    120,130,140,150,

and for this line interface_details.match("    120,130,140,150,") doesn't match the regex.
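You can check that quickly (assuming the compiled interface_details pattern from the question):

print(interface_details.match("Te1/0/1   CVH10 Mgt+Clstr Full   10000   Off  Up     On    T  (1),161-163") is not None)  # True
print(interface_details.match("    120,130,140,150,"))  # None: no port name at the start of the line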
Continuing @anirudh's answer:
test_str holds the entire string data read from the file, and regex is your regex.
finditer() returns an iterable of match objects. The re.MULTILINE flag makes ^ match at the start of every line, so the pattern can be applied to the whole multi-line string at once.
regex = r"^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?"
test_str = ("Port Description Duplex Speed Neg Link Flow M VLAN\n"
" State Ctrl\n"
"--------- --------------- ------ ------- ---- ------ ----- -- -------------------\n"
"Te1/0/1 CVH10 Mgt+Clstr Full 10000 Off Up On T (1),161-163\n"
"Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,\n"
" 120,130,140,150,\n"
" 160,170,180,190,\n"
" 200,210,230,240,\n"
" 250,666,999\n"
"Fo2/1/1 N/A N/A N/A Detach N/A\n"
"Te2/0/8 Full 10000 Off Down Off A 1")
for match in re.finditer(regex, test_str, re.MULTILINE):
print(match.groupdict())
This will get you the result you need. The above solution is a combination of this answer and the code generated from this site.
Related
So basically I was trying to scrape a Reddit link about Game of Thrones. This is the link: https://www.reddit.com/r/gameofthrones/wiki/episode_discussion, which contains many other links! What I was doing was scraping all the links into a file, which is done! Now I have to scrape every link individually and print out the data in individual files, either CSV or JSON.
I've tried all possible methods from Google but am still unable to come to a solution! Any help would be appreciated.
import praw
import json
import pandas as pd  # pandas, for saving the comments as a CSV

# This is PRAW.
reddit = praw.Reddit(client_id='',
                     client_secret='',
                     user_agent='android:com.example.myredditapp:v1.2.3 (by /u/AshKay12)',
                     username='******',
                     password='******')

subreddit = reddit.subreddit("gameofthrones")
Comments = []
submission = reddit.submission("links")

with open('got_reddit_links.json') as json_file:
    data = json.load(json_file)
    for p in data:
        print('season: ' + str(p['season']))
        print('episode: ' + str(p['episode']))
        print('title: ' + str(p['title']))
        print('links: ' + str(p['links']))
        print('')

submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    print(20 * '#')
    print('Parent ID:', comment.parent())
    print('Comment ID:', comment.id)
    print(comment.body)
    Comments.append([comment.body, comment.id])

Comments = pd.DataFrame(Comments, columns=['All_Comments', 'Comment ID'])
Comments.to_csv('Reddit3.csv')
This code prints out the links, title and episode number. It also extracts data when a link is manually entered, but there are over 50 links on the website, so I extracted those and put them in a file.
You can find all episode blocks with the links, and then write a function to scrape the comments for each episode discovered by each link:
from selenium import webdriver
from bs4 import BeautifulSoup as soup  # soup(...) below is BeautifulSoup
import requests, itertools, re

d = webdriver.Chrome('/path/to/chromedriver')
d.get('https://www.reddit.com/r/gameofthrones/wiki/episode_discussion')

# group the wiki page's h2/h4/table elements into (season header, episode table) blocks
new_d = soup(d.page_source, 'html.parser').find('div', {'class':'md wiki'}).find_all(re.compile('h2|h4|table'))
g = [(a, list(b)) for a, b in itertools.groupby(new_d, key=lambda x:x.name == 'h2')]
r = {g[i][-1][0].text:{g[i+1][-1][k].text:g[i+1][-1][k+1] for k in range(0, len(g[i+1][-1]), 2)} for i in range(0, len(g), 2)}
final_r = {a:{b:[j['href'] for j in c.find_all('a', {'href':re.compile('redd\.it')})] for b, c in k.items()} for a, k in r.items()}
Now, you have a dictionary with all the links structured according to Season and episode:
{'Season 1 Threads': {'1.01 Winter Is Coming': ['https://redd.it/gsd0t'], '1.02 The Kingsroad': ['https://redd.it/gwlcx'], '1.03 Lord Snow': ['https://redd.it/h1otp/'], '1.04 Cripples, Bastards, & Broken Things': ['https://redd.it/h70vv'].....
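For example, pulling out the link list for a single episode (using the keys visible in the sample above):

links = final_r['Season 1 Threads']['1.01 Winter Is Coming']
print(links)  # ['https://redd.it/gsd0t']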
To get the comments, you have to use selenium as well, to be able to click on the button that displays the entire comment structure:
import time

d = webdriver.Chrome('/path/to/chromedriver')

def scrape_comments(url):
    d.get(url)
    # click the "VIEW ENTIRE DISCUSSION" button to expand the full comment tree
    _b = [i for i in d.find_elements_by_tag_name('button') if 'VIEW ENTIRE DISCUSSION' in i.text][0]
    _b.send_keys('\n')
    time.sleep(1)
    p_obj = soup(d.page_source, 'html.parser').find('div', {'class':'_1YCqQVO-9r-Up6QPB9H6_4 _1YCqQVO-9r-Up6QPB9H6_4'}).contents
    p_obj = [i for i in p_obj if i != '\n']
    c = [{'poster':'[deleted]' if i.a is None else i.a['href'], 'handle':getattr(i.find('div', {'class':'_2X6EB3ZhEeXCh1eIVA64XM _2hSecp_zkPm_s5ddV2htoj _zMIUk6t-WDI7fxfkvD02'}), 'text', 'N/A'), 'points':getattr(i.find('span', {'class':'_2ETuFsVzMBxiHia6HfJCTQ _3_GZIIN1xcMEC5AVuv4kfa'}), 'text', 'N/A'), 'time':getattr(i.find('a', {'class':'_1sA-1jNHouHDpgCp1fCQ_F'}), 'text', 'N/A'), 'comment':getattr(i.p, 'text', 'N/A')} for i in p_obj]
    return c
Sample output when running scrape_comments on one of the urls:
[{'poster': '/user/BWPhoenix/', 'handle': 'N/A', 'points': 'Score hidden', 'time': '2 years ago', 'comment': 'Week one, so a couple of quick questions:'}, {'poster': '/user/No0neAtAll/', 'handle': 'N/A', 'points': '957 points', 'time': '2 years ago', 'comment': "Davos fans showing their love Dude doesn't say a word the entire episode and gives only 3 glances but still get's 548 votes."}, {'poster': '/user/MairmanChao/', 'handle': 'N/A', 'points': '421 points', 'time': '2 years ago', 'comment': 'Davos always gets votes for being the most honorable man in Westeros'}, {'poster': '/user/BourbonSlut/', 'handle': 'N/A', 'points': '47 points', 'time': '2 years ago', 'comment': 'I was hoping for some Tyrion dialogue too..'}.....
Now, putting it all together:
final_result = {a:{b:[scrape_comments(i) for i in c] for b, c in k.items()} for a, k in final_r.items()}
From here, you can create a pd.DataFrame from final_result or write the results to a file.
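As a rough sketch of that last step (assuming the final_result structure built above, where each link maps to a list of comment dicts):

import pandas as pd

rows = []
for season, episodes in final_result.items():
    for episode, threads in episodes.items():
        for thread in threads:  # one list of comment dicts per scraped link
            for comment in thread:
                rows.append({'season': season, 'episode': episode, **comment})

pd.DataFrame(rows).to_csv('got_comments.csv', index=False)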
Hello, can anyone please help out with this?
This is the content of my txt file:
DICT1 Assignment 1 25 100 nothing anyway at all
DICT2 Assignment 2 25 100 nothing at all
DICT3 Assignment 3 50 100 not at all
This is my code:
from pathlib import Path

home = str(Path.home())

with open(home + "\\Desktop\\PADS Assignment\\DICT1 Assessment Task.txt", "r") as r:
    for line in r:
        print(line.strip().split())
The output of the code is:
['DICT1', 'Assignment', '1', '25', '100', 'nothing']
['DICT2', 'Assignment', '2', '25', '100', 'nothing', 'at', 'all']
['DICT3', 'Assignment', '3', '50', '100', 'not', 'at', 'all']
Now my question is: how do I make the output be
['DICT1', 'Assignment 1', '25', '100', 'nothing']
['DICT2', 'Assignment 2', '25', '100', 'nothing at all']
['DICT3', 'Assignment 3', '50', '100', 'not at all']
You could use the maxsplit parameter of the split() method:
line.split(maxsplit=5)
This assumes the lines in your file all follow the same format and that you are using Python 3.
For Python 2.x you should use
line.split(' ', 5)
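A quick demonstration (Python 3); note how the sixth element keeps its internal spaces:

line = "DICT1 Assignment 1 25 100 nothing anyway at all"
print(line.split(maxsplit=5))
# ['DICT1', 'Assignment', '1', '25', '100', 'nothing anyway at all']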
Your main problem here is your input file: the separator is a space, but some of the values you need to retrieve contain spaces themselves.
So you have two choices here:
You either change the input file to comma-separated values (a csv-module sketch follows the sample), i.e.:
DICT1, Assignment, 1, 25, 100, nothing anyway at all
DICT2, Assignment, 2, 25, 100, nothing at all
DICT3, Assignment, 3, 50, 100, not at all
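If you go that route, a minimal sketch of reading the comma-separated file with the csv module (the file name here is just a placeholder):

import csv

with open("DICT1 Assessment Task.txt", newline="") as f:
    for row in csv.reader(f, skipinitialspace=True):
        print(row)  # e.g. ['DICT1', 'Assignment', '1', '25', '100', 'nothing anyway at all']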
Or you change your script to manually rejoin the pieces once the line has been split:
from pathlib import Path

home = str(Path.home())

with open(home + "\\Desktop\\PADS Assignment\\DICT1 Assessment Task.txt", "r") as r:
    for line in r:
        splittedLine = line.strip().split(" ")
        taskId = splittedLine[0]
        taskTitle = " ".join(splittedLine[1:3])  # e.g. "Assignment 1" spans two tokens
        weight = splittedLine[3]
        fullMark = splittedLine[4]
        description = " ".join(splittedLine[5:])
        print("taskId: " + taskId + " - taskTitle: " + taskTitle + " - weight: " +
              weight + " - fullMark: " + fullMark + " - description: " + description)
This is the data:
C:/data/my_file.txt.c:10:0x21:name1:name2:0x10:1:OK
C:/data/my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK
./data/my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK
And I would like to get this result
[C:/data/my_file.txt.c, 10, 0x21, name1, name2, 0x10, 1, OK]
[C:/data/my_file2.txt.c, 110, 0x1, name2, name5, 0x12, 1, NOT_OK]
[./data/my_file3.txt.c, 110, 0x1, name2, name5, 0x12, 10, OK]
I know how to do that with plain code or string splitting, but I am searching for a nice solution using pyparsing. My problem is the :/ in the file path.
Additional question: I use some code to strip comments and other stuff from the records, so the raw data looks like this:
text = """C:/data/my_file.txt.c:10:0x21:name1:name2:0x10:1:OK
C:/data/my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK
// comment
./data/my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK
----
ok
"""
And I strip the "//", "ok", and "----" lines before parsing right now.
So now I have a follow-up question to the first one:
Up to now I extracted the lines above from a data file, read the file line by line, and parsed each line; that works great. But then I found out it is possible to use parseFile to parse a whole file, so I think I could strip some of my code and use parseFile instead. The files I would like to parse have an additional footer.
C:/data/my_file.txt.c:10:0x21:name1:name2:0x10:1:OK
C:/data/my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK
./data/my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK: info message
-----------------------
3 Files 2 OK 1 NOT_OK
NOT_OK
Is it possible to change the parser to get 2 parse results?
Result 1:
[['C:/data/my_file.txt.c', '10', '0x21', 'name1', 'name2', '0x10', '1', 'OK'],
['C:/data/my_file2.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '1', 'NOT_OK'],
['./data/my_file3.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '10', 'OK']]
Ignore the blank line
Ignore this line => -----------------------
Result 2:
[['3', 'Files', '2', 'OK', '1', 'NOT_OK'],
 ['NOT_OK']]
So I changed the code to this:
# define an expression for your file reference
one_thing = Combine(
    oneOf(list(alphas)) + ':/' +
    Word(alphanums + '_-./'))

# define a catchall expression for everything else (words of non-whitespace characters,
# excluding ':')
another_thing = Word(printables + " ", excludeChars=':')

# define an expression of the two; be sure to list the file reference first
thing = one_thing | another_thing

# now use plain old pyparsing delimitedList, with ':' delimiter
list_of_things = delimitedList(thing, delim=':')

list_of_other_things = Word(printables).setName('a')

# run it and see...
parse_ret = OneOrMore(Group(list_of_things | list_of_other_things)).parseFile("data.file")
parse_ret.pprint()
And I get this result:
[['C:/data/my_file.txt.c', '10', '0x21', 'name1', 'name2', '0x10', '1', 'OK'],
['C:/data/my_file2.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '1', 'NOT_OK'],
['./data/my_file3.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '10', 'OK', 'info message'],
['-----------------------'],
['3 Files 2 OK 1 NOT_OK'],
['NOT_OK']]
So I can go with this, but is it possible to split the result into two named results? I searched the docs but I didn't find anything that works.
See embedded comments for pyparsing description:
from pyparsing import *

text = """C:/data/my_file.txt.c:10:0x21:name1:name2:0x10:1:OK
C:/data/my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK
// blah-de blah blah blah
./data/my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK"""

# define an expression for your file reference
one_thing = Combine(
    oneOf(list(alphas.upper())) + ':/' +
    Word(alphanums + '_-./'))

# define a catchall expression for everything else (words of non-whitespace characters,
# excluding ':')
another_thing = Word(printables, excludeChars=':')

# define an expression of the two; be sure to list the file reference first
thing = one_thing | another_thing

# now use plain old pyparsing delimitedList, with ':' delimiter
list_of_things = delimitedList(thing, delim=':')
parser = OneOrMore(Group(list_of_things))

# ignore comments starting with double slash
parser.ignore(dblSlashComment)

# run it and see...
parser.parseString(text).pprint()
prints:
[['C:/data/my_file.txt.c', '10', '0x21', 'name1', 'name2', '0x10', '1', 'OK'],
['C:/data/my_file2.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '1', 'NOT_OK'],
['./data/my_file3.txt.c', '110', '0x1', 'name2', 'name5', '0x12', '10', 'OK']]
So I didn't find a solution with delimitedList and parseFile, but I found a solution which is okay for me.
from pyparsing import *

data = """
C: / data / my_file.txt.c:10:0x21:name1:name2:0x10:1:OK
C: / data / my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK
./ data / my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK: info message
-----------------------
3 Files 2 OK 1 NOT_OK
NOT_OK
"""

if __name__ == '__main__':
    # define an expression for your file reference
    entry_one = Combine(
        oneOf(list(alphas)) + ':/' +
        Word(alphanums + '_-./'))
    entry_two = Word(printables + ' ', excludeChars=':')
    entry = entry_one | entry_two

    delimiter = Literal(':').suppress()
    tc_result_line = Group(
        entry.setResultsName('file_name') + delimiter +
        entry.setResultsName('line_nr') + delimiter +
        entry.setResultsName('num_one') + delimiter +
        entry.setResultsName('name_one') + delimiter +
        entry.setResultsName('name_two') + delimiter +
        entry.setResultsName('num_two') + delimiter +
        entry.setResultsName('status') +
        Optional(delimiter + entry.setResultsName('msg'))).setResultsName("info_line")

    EOL = LineEnd().suppress()
    SOL = LineStart().suppress()
    blank_line = SOL + EOL

    tc_summary_line = Group(
        Word(nums).setResultsName("num_of_lines") + "Files" +
        Word(nums).setResultsName("num_of_ok") + "OK" +
        Word(nums).setResultsName("num_of_not_ok") + "NOT_OK").setResultsName("info_summary")
    tc_end_line = (Literal("NOT_OK") | Literal('Ok')).setResultsName("info_result")

    # run it and see...
    pp1 = tc_result_line | Optional(tc_summary_line | tc_end_line)
    pp1.ignore(blank_line | OneOrMore("-"))

    result = list()
    for l in data.split('\n'):
        result.append((pp1.parseString(l)).asDict())

    # delete empty results
    result = filter(None, result)
    for r in result:
        print(r)
Result:
{'info_line': {'file_name': 'C', 'num_one': '10', 'msg': '1', 'name_one': '0x21', 'line_nr': '/ data / my_file.txt.c', 'status': '0x10', 'num_two': 'name2', 'name_two': 'name1'}}
{'info_line': {'file_name': 'C', 'num_one': '110', 'msg': '1', 'name_one': '0x1', 'line_nr': '/ data / my_file2.txt.c', 'status': '0x12', 'num_two': 'name5', 'name_two': 'name2'}}
{'info_line': {'file_name': './ data / my_file3.txt.c', 'num_one': '0x1', 'msg': 'OK', 'name_one': 'name2', 'line_nr': '110', 'status': '10', 'num_two': '0x12', 'name_two': 'name5'}}
{'info_summary': {'num_of_lines': '3', 'num_of_ok': '2', 'num_of_not_ok': '1'}}
{'info_result': ['NOT_OK']}
Using re:

import re

myList = ["C:/data/my_file.txt.c:10:0x21:name1:name2:0x10:1:OK",
          "C:/data/my_file2.txt.c:110:0x1:name2:name5:0x12:1:NOT_OK",
          "./data/my_file3.txt.c:110:0x1:name2:name5:0x12:10:OK"]

for i in myList:
    newTxt = re.sub(r':', ",", i)         # turn every ':' into ','
    newTxt = re.sub(r',/', ":/", newTxt)  # restore the ':/' after the drive letter
    print(newTxt)
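For the sample list above this should print comma-separated lines, e.g. C:/data/my_file.txt.c,10,0x21,name1,name2,0x10,1,OK for the first entry.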
Using Python, I am attempting to extract data from the several "fields" of a Wikipedia Taxobox (an infobox which is usually displayed for each animal or plant species page, see for example here: https://en.wikipedia.org/wiki/Okapi).
The solution provided here (How to use Wikipedia API to get section of sidebar?) is interesting but not useful in my case, since I am interested in data from a lower taxonomic category (species).
What I want is a way (as pythonic as possible) to access every field in a Taxobox and then get the data (as a dictionary, perhaps) of interest.
Thanks in advance for any assistance.
EDIT: Here (https://github.com/siznax/wptools) is another good solution which should be what I need, but unfortunately it is a set of command-line tools (besides depending on other command-line tools available only on Linux) and not a Python library.
EDIT2: wptools is a (Python 2+3) library now.
@maurobio, @jimhark: wptools is a Python (2+3) library now. It will give you any infobox with "box" in the name as a Python dict, but you probably want to use Wikidata (e.g. okapi https://www.wikidata.org/wiki/Q82037) because infoboxen are messy (to say the least). If you focus on Wikidata, then everyone benefits, and wptools can get Wikidata for you too. We've recently updated wptools so that it gets ALL Wikidata by default.
You can get the infobox data in the example below in some languages, but as @biojl points out, wikitext has a different structure in different languages!
>>> page = wptools.page('Okapi')
>>> page.get_parse()
en.wikipedia.org (parse) Okapi
en.wikipedia.org (imageinfo) File:Okapi2.jpg
Okapi (en) data
{
image: <list(1)> {'kind': 'parse-image', u'descriptionshorturl':...
infobox: <dict(9)> status, status_ref, name, image, taxon, autho...
iwlinks: <list(4)> https://commons.wikimedia.org/wiki/Okapia_joh...
pageid: 22709
parsetree: <str(39115)> <root><template><title>about</title><par...
requests: <list(2)> parse, imageinfo
title: Okapi
wikibase: Q82037
wikidata_url: https://www.wikidata.org/wiki/Q82037
wikitext: <str(29930)> {{about|the animal}}{{good article}}{{use...
}
>>> page.data['infobox']
{'authority': '([[P.L. Sclater]], 1901)',
'image': 'Okapi2.jpg',
'image_caption': "An okapi at [[Disney's Animal Kingdom]] in [[Florida]].",
'name': 'Okapi',
'parent_authority': '[[Ray Lankester|Lankester]], 1901',
'status': 'EN',
'status_ref': '<ext><name>ref</name><attr> name=iucn</attr><inner>{{IUCN2008|assessor=IUCN SSC Antelope Specialist Group|year=2008|id=15188|title=Okapia johnstoni|downloaded=26 November 2013}} Database entry includes a brief justification of why this species is endangered.</inner><close></ref></close></ext>',
'status_system': 'IUCN3.1',
'taxon': 'Okapia johnstoni'}
However, because it is structured, you can get Wikidata in many languages, e.g.
>>> page = wptools.page('Okapi', lang='fr')
>>> page.get_wikidata()
www.wikidata.org (wikidata) Okapi
www.wikidata.org (labels) P646|P349|P373|P685|P627|Q16521|Q7432|Q...
fr.wikipedia.org (imageinfo) File:Okapia johnstoni -Marwell Wildl...
Okapi (fr) data
{
aliases: <list(2)> Mondonga, Okapia johnstoni
claims: <dict(26)> P646, P181, P935, P815, P373, P1417, P685, P1...
description: espèce de mammifères
image: <list(2)> {'kind': 'wikidata-image', u'descriptionshortur...
label: Okapi
labels: <dict(31)> P646, P373, P685, P627, Q16521, Q7432, Q20415...
modified: <dict(1)> wikidata
pageid: 84481
requests: <list(3)> wikidata, labels, imageinfo
title: Okapi
what: taxon
wikibase: Q82037
wikidata: <dict(26)> identifiant BioLib (P838), taxon supérieur ...
wikidata_url: https://www.wikidata.org/wiki/Q82037
}
>>> page.data['wikidata']
{u'carte de r\xe9partition (P181)': u'Okapi distribution.PNG',
u'cat\xe9gorie Commons (P373)': u'Okapia johnstoni',
u'dur\xe9e de gestation (P3063)': {u'amount': u'+14.5',
u'lowerBound': u'+14.0',
u'unit': u'http://www.wikidata.org/entity/Q5151',
u'upperBound': u'+15.0'},
u'd\xe9crit par (P1343)': u'encyclop\xe9die Otto (Q2041543)',
u'galerie Commons (P935)': u'Okapia johnstoni',
u'identifiant ARKive (P2833)': u'okapi/okapia-johnstoni',
u'identifiant Animal Diversity Web (P4024)': u'Okapia_johnstoni',
u'identifiant Biblioth\xe8que nationale de la Di\xe8te (P349)': u'01092792',
u'identifiant BioLib (P838)': u'33523',
u'identifiant Encyclopedia of Life (P830)': u'308387',
u'identifiant Encyclop\xe6dia Britannica en ligne (P1417)': u'animal/okapi',
u'identifiant Fossilworks (P842)': u'149380',
u'identifiant Freebase (P646)': u'/m/05pf4',
u'identifiant GBIF (P846)': u'2441207',
u'identifiant ITIS (P815)': u'625037',
u'identifiant Mammal Species of the World (P959)': u'14200484',
u'identifiant NCBI (P685)': u'86973',
u'identifiant UICN (P627)': u'15188',
u'identifiant de la Grande Encyclop\xe9die russe en ligne (P2924)': u'2290412',
u'image (P18)': [u'Okapia johnstoni -Marwell Wildlife, Hampshire, England-8a.jpg',
u'Okapia johnstoni1.jpg'],
u"nature de l'\xe9l\xe9ment (P31)": u'taxon (Q16521)',
u'nom scientifique du taxon (P225)': u'Okapia johnstoni',
u'nom vernaculaire (P1843)': [u'Okapi', u'Okapi'],
u'rang taxinomique (P105)': u'esp\xe8ce (Q7432)',
u'statut de conservation UICN (P141)': u'esp\xe8ce en danger (Q11394)',
u'taxon sup\xe9rieur (P171)': u'Okapia (Q1872039)'}
Don't forget that you can edit Wikidata in your own language. There are tools available to enable editing a large number of Wikidata pages.
EDIT: we've added a more general parser that should work (to some extent) with any infobox syntax, e.g.
>>> page = wptools.page('Okapi', lang='fr')
>>> page.get_parse()
fr.wikipedia.org (parse) Okapi
Okapi (fr) data
{
infobox: <dict(2)> count, boxes
...
}
>>> page.data['infobox']['count']
13
>>> page.data['infobox']['boxes']
[{u'Taxobox d\xe9but': [[{'index': '1'}, 'animal'],
[{'index': '2'}, "''Okapia johnstoni''"],
[{'index': '3'}, 'Okapi2.jpg'],
[{'index': '4'}, 'Okapi']]},
{'Taxobox': [[{'index': '1'}, 'embranchement'],
[{'index': '2'}, 'Chordata']]},
{'Taxobox': [[{'index': '1'}, 'classe'], [{'index': '2'}, 'Mammalia']]},
{'Taxobox': [[{'index': '1'}, 'sous-classe'], [{'index': '2'}, 'Theria']]},
{'Taxobox': [[{'index': '1'}, 'ordre'], [{'index': '2'}, 'Artiodactyla']]},
{'Taxobox': [[{'index': '1'}, 'famille'], [{'index': '2'}, 'Giraffidae']]},
{'Taxobox taxon': [[{'index': '1'}, 'animal'],
[{'index': '2'}, 'genre'],
[{'index': '3'}, 'Okapia'],
[{'index': '4'}, '[[Edwin Ray Lankester|Lankester]], [[1901]]']]},
{'Taxobox taxon': [[{'index': '1'}, 'animal'],
[{'index': '2'}, u'esp\xe8ce'],
[{'index': '3'}, 'Okapia johnstoni'],
[{'index': '4'}, '([[Philip Lutley Sclater|Sclater]], [[1901]])']]},
{'Taxobox synonymes': [[{'index': '1'},
"* ''Equus johnstoni'' <small>P.L. Sclater, 1901</small>"]]},
{'Taxobox UICN': [[{'index': '1'}, 'EN'], [{'index': '2'}, 'A2abcd+4abcd']]},
{u'Taxobox r\xe9partition': [[{'index': '1'}, 'Okapi map.jpg']]},
{u'Taxobox r\xe9partition': [[{'index': '1'}, 'Okapi distribution.PNG']]},
{'Taxobox fin': []}]
Hope that helps.
{@siznax has posted a better answer. I'm only leaving my answer here as an example of using the wiki APIs and parsing the results. This would only be of practical use if a library like wptools couldn't meet your needs for some reason.}
This is a significant rewrite that includes a (more) proper parser to match the template's closing double braces '}}'. It also makes it easier to request different template names and includes a main() to allow testing from the shell / command line.
import sys
import re
import requests
import json

wikiApiRoot = 'https://en.wikipedia.org/w/api.php'

# returns the position past the requested token or end of string if not found
def FindToken(text, token, start=0):
    pos = text.find(token, start)
    if -1 == pos:
        nextTokenPos = len(text)
    else:
        nextTokenPos = pos
    return nextTokenPos + len(token)

# Get the contents of the template as text
def GetTemplateText(wikitext, templateName):
    templateTag = '{{' + templateName
    startPos = FindToken(wikitext, templateTag)
    if (len(wikitext) <= startPos):
        # Template not found
        return None
    openCount = 1
    curPos = startPos
    nextOpenPos = FindToken(wikitext, '{{', curPos)
    nextClosePos = FindToken(wikitext, '}}', curPos)
    # scan for template's matching close braces
    while 0 < openCount:
        if nextOpenPos < nextClosePos:
            openCount += 1
            curPos = nextOpenPos
            nextOpenPos = FindToken(wikitext, '{{', curPos)
        else:
            openCount -= 1
            curPos = nextClosePos
            nextClosePos = FindToken(wikitext, '}}', curPos)
    templateText = wikitext[startPos:curPos-2]
    return templateText

def GetTemplateDict(title, templateName='Taxobox'):
    templateDict = None
    # Get data from Wikipedia:
    resp = requests.get(wikiApiRoot + '?action=query&prop=revisions&' +
                        'rvprop=content&rvsection=0&format=json&redirects&titles=' +
                        title)
    # Get the response text into a JSON object:
    rjson = json.loads(resp.text)
    # Pull out the text for the revision (list() so this also works on Python 3):
    wikitext = list(rjson['query']['pages'].values())[0]['revisions'][0]['*']
    # Parse the text for the template
    templateText = GetTemplateText(wikitext, templateName)
    if templateText:
        # Parse templateText to get named properties
        templateItemIter = re.finditer(
            r'\|\s*(\w*)\s*=\s*([^\n]*)\n',
            templateText,
            re.M)
        templateList = [item.groups([0,1]) for item in templateItemIter]
        templateDict = dict(templateList)
    return templateDict

def main():
    import argparse
    import pprint
    parser = argparse.ArgumentParser()
    parser.add_argument('title', nargs='?', default='Okapia_johnstoni', help='title of the desired article')
    parser.add_argument('template', nargs='?', default='Taxobox', help='name of the desired template')
    args = parser.parse_args()
    templateDict = GetTemplateDict(args.title, args.template)
    pprint.pprint(templateDict)

if __name__ == "__main__":
    main()
GetTemplateDict returns a dictionary of the page's taxobox entries. For the Okapi page, this includes:
binomial
binomial_authority
classis
familia
genus
genus_authority
image
image_caption
ordo
phylum
regnum
species
status
status_ref
status_system
trend
I expect the actual items to vary by page.
The dictionary values are Wikipedia's decorated text:
>>> taxoDict['familia']
'[[Giraffidae]]'
So additional parsing or filtering may be desired or required.
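If you want the plain names, here is a minimal sketch of stripping the [[...]] wiki-link markup (strip_wikilinks is a hypothetical helper, not part of the code above):

import re

def strip_wikilinks(value):
    # [[target|label]] -> label, [[target]] -> target
    return re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', value)

print(strip_wikilinks('[[Giraffidae]]'))               # Giraffidae
print(strip_wikilinks('[[Ray Lankester|Lankester]]'))  # Lankester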
I am experiencing a strange faulty behaviour where a dictionary is only populated once and I cannot add more key-value pairs to it.
My code reads in a multi-line string and extracts substrings via split() to be added to a dictionary. I make use of conditional statements, but strangely only the key-value pairs under the first conditional statement are added. Therefore I cannot complete the dictionary.
How can I solve this issue?
Minimal code:
# I hope the '\n' is sufficient, or use '\r\n'
example = "Name: Bugs Bunny\nDOB: 01/04/1900\nAddress: 111 Jokes Drive, Hollywood Hills, CA 11111, United States"

def format(data):
    dic = {}
    for line in data.splitlines():
        #print('Line:', line)
        if ':' in line:
            info = line.split(': ', 1)[1].rstrip()  #does not work with files
            #print('Info: ', info)
            if ' Name:' in info:  #middle name problems! /maiden name
                dic['F_NAME'] = info.split(' ', 1)[0].rstrip()
                dic['L_NAME'] = info.split(' ', 1)[1].rstrip()
            elif 'DOB' in info:  #overhang
                dic['DD'] = info.split('/', 2)[0].rstrip()
                dic['MM'] = info.split('/', 2)[1].rstrip()
                dic['YY'] = info.split('/', 2)[2].rstrip()
            elif 'Address' in info:
                dic['STREET'] = info.split(', ', 2)[0].rstrip()
                dic['CITY'] = info.split(', ', 2)[1].rstrip()
                dic['ZIP'] = info.split(', ', 2)[2].rstrip()
    return dic

if __name__ == '__main__':
    x = format(example)
    for v, k in x.iteritems():
        print v, k
Your code doesn't work at all. You split off the name before the colon and discard it, looking only at the value after the colon, stored in info. That value never contains the names you are looking for; Name, DOB and Address are all part of the line before the colon.
Python lets you assign to multiple names at once; make use of this when splitting:
def format(data):
    dic = {}
    for line in data.splitlines():
        if ':' not in line:
            continue
        name, _, value = line.partition(':')
        name = name.strip()
        if name == 'Name':
            dic['F_NAME'], dic['L_NAME'] = value.split(None, 1)  # strips whitespace for us
        elif name == 'DOB':
            dic['DD'], dic['MM'], dic['YY'] = (v.strip() for v in value.split('/', 2))
        elif name == 'Address':
            dic['STREET'], dic['CITY'], dic['ZIP'] = (v.strip() for v in value.split(', ', 2))
    return dic
I used str.partition() here rather than limit str.split() to just one split; it is slightly faster that way.
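A quick illustration of the difference (str.partition() always returns a 3-tuple, even when the separator is missing):

line = "Name: Bugs Bunny"
print(line.partition(':'))        # ('Name', ':', ' Bugs Bunny')
print(line.split(':', 1))         # ['Name', ' Bugs Bunny']
print("no colon".partition(':'))  # ('no colon', '', '')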
For your sample input this produces:
>>> format(example)
{'CITY': 'Hollywood Hills', 'ZIP': 'CA 11111, United States', 'L_NAME': 'Bunny', 'F_NAME': 'Bugs', 'YY': '1900', 'MM': '04', 'STREET': '111 Jokes Drive', 'DD': '01'}
>>> from pprint import pprint
>>> pprint(format(example))
{'CITY': 'Hollywood Hills',
'DD': '01',
'F_NAME': 'Bugs',
'L_NAME': 'Bunny',
'MM': '04',
'STREET': '111 Jokes Drive',
'YY': '1900',
'ZIP': 'CA 11111, United States'}