The following Regex gives me this output (note that I am using Python):
Which is perfect and exactly how I want it to be. However, when I run the same regex in Python it works, but it doesn't capture the next lines of vlans when I use groupdict (see the second entry):
{'port_name': 'Te1/0/1', 'description': 'CVH10 Mgt+Clstr', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),161-163'}
{'port_name': 'Te1/0/2', 'description': 'CVH10 VM 1', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),11,101,110,'}
{'port_name': 'Fo2/1/1', 'description': None, 'duplex': 'N/A', 'speed': 'N/A', 'neg': 'N/A', 'link_state': 'Detach', 'flow_control': 'N/A', 'mode': None, 'vlans': None}
{'port_name': 'Te2/0/8', 'description': None, 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Down', 'flow_control': 'Off', 'mode': ' A', 'vlans': '1'}
As you can see in the Regex above the second entry matches 19 vlans, but the Python output only gives me 4. How can I fix this?
This is the code that I'm running:
from sys import argv
import re
import pprint
pp = pprint.PrettyPrinter()
script, filename = argv
# Pre-compiled at import time.  re.M is required once the whole file is matched
# in a single pass: it lets ^ anchor at the start of every line, while the
# (?:\r?\n[^\S\r\n]+[\d(),-]+)* tail of <vlans> absorbs the indented
# continuation lines that carry the rest of a long VLAN list.
interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?', re.M)

# Kept at module level for backward compatibility with anything importing it.
local_list = []


def main():
    """Parse 'show interfaces status' output and print one dict per port.

    Reads the whole capture file at once instead of line by line: matching a
    single line at a time can never see an interface row together with its
    wrapped VLAN continuation lines, which is why groupdict() appeared to
    stop after the first chunk of VLANs.
    """
    with open(filename) as current_file:
        text = current_file.read()
    for match in interface_details.finditer(text):
        local_list.append(match.groupdict())
    for each in local_list:
        print(each)


if __name__ == '__main__':
    main()
Note that I'm using argv, so it's run as: python3 main.py test.txt
The data of the text file is listed below
>show interfaces status
Port Description Duplex Speed Neg Link Flow M VLAN
State Ctrl
--------- --------------- ------ ------- ---- ------ ----- -- -------------------
Te1/0/1 CVH10 Mgt+Clstr Full 10000 Off Up On T (1),161-163
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
120,130,140,150,
160,170,180,190,
200,210,230,240,
250,666,999
Fo2/1/1 N/A N/A N/A Detach N/A
Te2/0/8 Full 10000 Off Down Off A 1
Currently you are reading separate lines, so the pattern will not match for the lines that have only this:
120,130,140,150,
What you could do is read the whole file instead using current_file.read() and add re.M enabling multiline.
In your code you are using this, which will first update the dict, and then append the working_dict resulting in n times the same (last) value as it points to the same dict.
working_dict.update(interface_details_result.groupdict())
local_list.append(working_dict)
If you want to gather all the groupdict's in a list, you can append it using local_list.append(m.groupdict())
import re
import pprint
pp = pprint.PrettyPrinter()
# Same pattern as the question's, compiled with re.M so ^ matches at the start
# of every line of the whole-file read performed below.
interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?', re.M)


def main():
    """Print the groupdict of every interface found in the capture file."""
    # NOTE(review): `filename` is assumed to be bound at module level
    # (e.g. unpacked from sys.argv) — confirm against the full script.
    with open(filename) as current_file:
        parsed = [match.groupdict()
                  for match in interface_details.finditer(current_file.read())]
    for entry in parsed:
        print(entry)


if __name__ == '__main__':
    main()
You are matching line by line.
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
120,130,140,150,
160,170,180,190,
200,210,230,240,
250,666,999
The first line which is-
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
passes your regex expression.
But the following lines don't. For example, the second line is-
120,130,140,150,
For this interface_details.match(" 120,130,140,150,") doesn't match the regex.
Continuing #anirudh's answer,
test_str will hold the entire string data read from file and regex will be your regex
finditer() will return an iterable of matches. re.MULTILINE param will enable pattern search on the entire multi-line string/data
# Named-group pattern for one interface row plus its wrapped VLAN lines.
regex = r"^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?"

# Sample 'show interfaces status' capture kept as one multi-line string.
test_str = ("Port Description Duplex Speed Neg Link Flow M VLAN\n"
            " State Ctrl\n"
            "--------- --------------- ------ ------- ---- ------ ----- -- -------------------\n"
            "Te1/0/1 CVH10 Mgt+Clstr Full 10000 Off Up On T (1),161-163\n"
            "Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,\n"
            " 120,130,140,150,\n"
            " 160,170,180,190,\n"
            " 200,210,230,240,\n"
            " 250,666,999\n"
            "Fo2/1/1 N/A N/A N/A Detach N/A\n"
            "Te2/0/8 Full 10000 Off Down Off A 1")

# Compile once with MULTILINE so ^ anchors at each line, then walk every hit.
pattern = re.compile(regex, re.MULTILINE)
for found in pattern.finditer(test_str):
    print(found.groupdict())
This will get you the result you need. Above solution is a combination of this answer and the code generated from this site
Im using bs4 for Python, I want to get a json from a web page but its like this:
<script>
vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});
</script>
Using beautifulsoup for python, but there is no class to identify
Thank you
You can simply use the 'script' tag to find the element:
# The <script> element carries no class/id, so select it by tag name alone.
soup = BeautifulSoup('''<script>vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});</script>''', 'html.parser')
# .contents[0] is the text node inside the tag, i.e. the raw JavaScript
# source as a string (BeautifulSoup must be imported from bs4 elsewhere).
js_code = soup.find('script').contents[0]
js_code is then
vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});
The tricky part is extracting the JSON from it.
I will rarely root for regex for this kind of tasks, but this is a rare one.
import re
...
js_code = soup.find('script').contents[0]
print(re.search('{.*}', js_code).group(0))
This outputs
{"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"}
Which can be converted to a Python dict using json.loads.
import json
...
print(json.loads(re.search('{.*}', js_code).group(0)))
Outputs
{'pageCategory': 'Product', 'pageDepartment': 'Calzado', 'pageUrl': 'http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p', 'pageTitle': 'AIR FORCE 1 07 LV8 | MASCULINO - tafmx', 'skuStockOutFromShelf': [], 'skuStockOutFromProductDetail': ['23312', '23313', '23314', '23316', '23325', '23326', '23327', '23328'], 'shelfProductIds': ['140', '141', '142', '3775', '3777', '3782', '3785', '545', '17', '314', '318', '530', '645', '801', '822', '940'], 'accountName': 'tafmx', 'pageFacets': [], 'productId': '3829', 'productReferenceId': 'CU8070-100', 'productEans': ['194502172393', '194502172409', '194502172416', '194502172423', '194502172430', '194502172447', '194502172454', '194502172461', '194502172478', '194502172485', '194502172492', '194502172508', '194502172515', '194502172522', '194502172539', '194502172546', '194502172553'], 'skuStocks': {'23312': 0, '23313': 0, '23314': 0, '23315': 11, '23316': 0, '23317': 19, '23318': 29, '23319': 22, '23320': 12, '23321': 7, '23322': 9, '23323': 15, '23324': 14, '23325': 0, '23326': 0, '23327': 0, '23328': 0}, 'productName': 'AIR FORCE 1 07 LV8', 'productBrandId': 2000004, 'productBrandName': 'Nike', 'productDepartmentId': 7, 'productDepartmentName': 'Calzado', 'productCategoryId': 8, 'productCategoryName': 'Sneakers', 'productListPriceFrom': '2199', 'productListPriceTo': '2199', 'productPriceFrom': '2199', 'productPriceTo': '2199', 'sellerId': '1', 'sellerIds': '1'}
Note that you may need to use a more complex regex if the script tag contains other things you did not show in the question.
I want to fetch some data from a certain web page (online book store).
The web pages look like this: http://www.rob389.com/dp/tr/11/9789754681383
I want to parse the <script> section that contains the following string:
tOBJ.DATA[0].MMM00_ITEM_CODE="9789754681383"
Where 9789754681383 is the ISBN number.
I first need to find and extract the correct <script> section.
Then, I want to iterate through the content of the tOBJ.DATA[0] object and convert it into a dictionary which would look like:
my_dict["MMM00_ITEM_CODE"] = "9789754681383"
my_dict["MMM00_TITLE"] = "Uykusuz Bir Gece"
And so on...
I guess it's possible to do this in Python with a few lines, but I cannot figure out how.
Any suggestions?
Thanks in advance.
PS: Any other parsing suggestions are welcome.
I would almost always recommend the use of BeautifulSoup - however, this page seems to have a '<!-->' tag half-way down which kills the parser. So, re to the rescue...
import urllib
import re
# NOTE(review): Python 2 code — in Python 3 this is urllib.request.urlopen,
# and read() returns bytes rather than str, so the regex below would need a
# decode step first.
url = 'http://www.rob389.com/dp/tr/11/9789754681383'
txt = urllib.urlopen(url).read()
# Capture every "tOBJ.DATA[0].<KEY> = <VALUE>;" assignment; DOTALL lets a
# value span physical lines (re.M is harmless here: no ^/$ in the pattern).
pat = re.compile(r'tOBJ\.DATA\[0\]\.(\w+)\s*=\s*([^;]*);', flags=re.M|re.DOTALL)
# Strip the surrounding double quotes from each captured value, build a dict.
my_dict = dict((k,v.strip('"')) for k,v in pat.findall(txt))
which results in a 132-entry dictionary including
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
If you want to restrict it to only keys beginning with 'MMM00', try
my_dict = dict((k,v.strip('"')) for k,v in pat.findall(txt) if k.startswith('MMM00'))
which only returns 15 items.
You can do it with BeautifulSoup and a little bit of code:
from bs4 import BeautifulSoup
import urllib2
from urlparse import urlsplit
def isbnExtractor(url):
    # Python 2 function (print *statement*); prints rather than returns.
    # Splitting 'http://host/dp/tr/11/<isbn>' on '/' puts the ISBN at
    # index 6 of the resulting list.
    # NOTE(review): the hard-coded index assumes exactly this URL layout —
    # any other path depth silently prints the wrong segment or raises.
    urlContent = url.strip('/').split('/')
    print urlContent[6]
And this should do the trick. Like this:
PS python
Python 2.7.2 (default, Jun 12 2011, 15:08:59) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from parse import isbnExtractor
>>> url = "http://www.rob389.com/dp/tr/11/9789754681383"
>>> isbnExtractor(url)
9789754681383
The page is so horribly invalid XML that all the normal approaches using parse() from lxml.tree followed by xpath etc fail miserably. So looks like your best bet is something like:
>>> import re
>>> import urllib
>>> import pprint
>>> s = urllib.urlopen("http://www.rob389.com/dp/tr/11/9789754681383").read()
>>> magic = re.compile(r'tOBJ.DATA\[0\].([A-Z0-9_]+)="([^"]+)"')
>>> my_dict = dict(magic.findall(s))
>>> pprint.pprint(my_dict)
{'DISC_PERC': '15.0000000000',
'EXCHANGE_RT': '2.2815',
'LNK_PREFIX': 'uykusuz-bir-gece-jill-murphy',
'LST_PRICE': '7.500000000000000',
'LST_YAX02_CODE': 'YTL',
'MMG00_CODE': '11',
'MMG00_TITLE': 'Kitap',
'MMM00_DESC': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DESC250': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DISC_PERC_SAL': '25',
'MMM00_HEIGHT': '19.6',
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
'MMM00_ZHEIGHT': '1',
'MMS03_PRICE_1': '7.500000000000000',
'MMS03_PRICE_2': '0.000000000000000',
'MMS03_PRICE_3': '7.500000000000000',
'MMS03_YAX02_CODE_1': 'YTL',
'MMS03_YAX02_CODE_2': 'YTL',
'MMS03_YAX02_CODE_3': 'YTL',
'NWS01_DESC': "<BR>New Orleans'da do\xf0an Lillian Hellman'\xfdn ilkgen\xe7li\xf0i daha sonra oyunlar\xfdnda \xfcst\xfc kapal\xfd olarak yer bulacak olan tuhaf ve h\xfdrsl\xfd akrabalar aras\xfdnda ge\xe7ti. New Orleans ve New York aras\xfdnda mekik dokuyarak ge\xe7en y\xfdllarda etraf\xfdndaki farkl\xfd k\xfclt\xfcrleri g\xf6zlemleme \xfeans\xfd buldu. Liseyi bitirdikten sonra Columbia ve New York \xdcniversitesi'ne devam ettiyse de, e\xf0itimini yar\xfdda b\xfdrakarak bir yay\xfdnevinde \xe7al\xfd\xfemaya ba\xfelad\xfd. 1920'lerin bohem hayat\xfdna g\xf6z k\xfdrpt\xfd\xf0\xfd bu d\xf6nemde tan\xfd\xfet\xfd\xf0\xfd gen\xe7 yazar Arthur Kober ile evlenerek Hollywood'a ta\xfe\xfdnd\xfd. <BR><BR>1930'lar\xfdn ba\xfe\xfdnda MGM'de d\xfczeltmenlik yapt\xfd. Hevesli bir solcu oldu\xf0u bu y\xfdllarda, i\xfe arkada\xfelar\xfdn\xfd sendikala\xfemalar\xfd i\xe7in<A class=A2 href=\\",
'NWS01_DESC400': '<A class=A3 href=\\',
'NWS01_ID': '588',
'NWS01_IMAGE': '/UD_OBJS/IMAGES/NWS/HSTTR/Hellman_L_231204_365_1.jpg',
'ON_ESHOP': 'T',
'PEP01_ID': '229016',
'PEP01_NAME': 'Jill Murphy',
'PRD_FNM01_ID': '23462',
'PRD_FNM01_TITLE': 'Mandolin',
'PRD_FNM01_TRD_TITLE': 'Say Yay\xfdnlar\xfd',
'PUR_VAT_VALUE': '8',
'SAL_PRICE': '6.3750000000',
'SAL_VAT_VALUE': '8',
'SAL_YAX02_CODE': 'YTL',
'UD_10': '~410~|',
'UD_10_VAL': 'T\xfcrk\xe7e',
'UD_11': '~1000~|~803.2~|',
'UD_11_VAL': '\xc7ocuk,\xd6yk\xfc',
'UD_12': '~1000.4080~|',
'UD_12_VAL': '\xc7ocuk | 07-12 Ya\xfe | Edebiyat',
'UD_15': '978-975-468-138-3',
'UD_15_VAL': '978-975-468-138-3',
'UD_16': '~PB~|',
'UD_16_VAL': 'Karton Kapak',
'UD_19': '01/01/2010',
'UD_19_VAL': '01/01/2004',
'UD_2': 'The Worst Witch Strikes Again',
'UD_20': '92',
'UD_20_VAL': '92',
'UD_21': '52',
'UD_21_VAL': '52',
'UD_22': '3',
'UD_22_VAL': '3',
'UD_23': '1',
'UD_23_VAL': '1',
'UD_24': '~HM1~|',
'UD_24_VAL': '1. Hamur',
'UD_26': '7-12',
'UD_26_VAL': '07-12',
'UD_2_VAL': 'The Worst Witch Strikes Again',
'UD_3': '~229016~|',
'UD_30': '1',
'UD_30_VAL': '1',
'UD_31': '1',
'UD_31_VAL': '1',
'UD_34': '~1~|',
'UD_34_VAL': '1-3 G\xfcn',
'UD_36': '1',
'UD_36_VAL': '1',
'UD_39': 'VAR',
'UD_39_VAL': 'Var',
'UD_3_VAL': 'Jill Murphy',
'UD_42': '~410~|',
'UD_42_VAL': 'T\xfcrk\xe7e',
'UD_6': '~239986~|',
'UD_6_VAL': 'Seza Sunar',
'YAX02_CODE': 'EUR'}
>>>