Python regex to get the date from different combinations - python

I have a text file having multiple lines one of the line contains field description, and that field has multiple combination or notation of dates surrounded by other strings like colasas|04/18/2017|NXP , FTP Permanent|09|10|2012|FTP, and Project|16 July 2005|Design. from which I want to parse the dates only, One way I found is to use dateutil module which looks to be complicated and lot of manipulation for this purpose.
So, while going through the examples and testing them, it works for certain combinations:
>>> from dateutil.parser import parse
>>> test_cases = ['04/30/2009', '06/20/95', '8/2/69', '1/25/2011', '9/3/2002', '4-13-82', 'Mar-02-2009', 'Jan 20, 1974',
... 'March 20, 1990', 'Dec. 21, 2001', 'May 25 2009', '01 Mar 2002', '2 April 2003', '20 Aug. 2004',
... '20 November, 1993', 'Aug 10th, 1994', 'Sept 1st, 2005', 'Feb. 22nd, 1988', 'Sept 2002', 'Sep 2002',
... 'December, 1998', 'Oct. 2000', '6/2008', '12/2001', '1998', '2002']
>>> for date_string in test_cases:
... print(date_string, parse(date_string).strftime("%Y%m%d"))
...
04/30/2009 20090430
06/20/95 19950620
8/2/69 19690802
----- etc --------
However, I have the below data combination which I need to parse but while opting for above solution it fails to get the results..
As the description is optional and may be missing in places, I considered using (?:description:* (.*))?.
description: colasas|04/18/2017|NXP
description: colasas|04/18/2017|NXP
description: Remedy Tkt 01212152 Orcad move
description: FTP Permanent|09|10|2012|FTP
description: Remedy Tkt 01212152 Orcad move
description: TDA Drop12 Account|July 2004|TDA Drop12 Account
description: ftp|121210|ftp
description: Design Foundry Project|16 July 2005|Design Foundry Project
description: FTP Permanent|10/10/2010|FTP
description: WFS-JP|7-31-05|WFS-JP
description: FTP Permanent|10|11|2010|FTP
I have re-formatted the question to give it more visibility and to get more input.
Below is the actual script, which has three different matches: dn, ftpuser, and the last one, description, which is the one I'm looking to solve.
The script below works for all of the matches except the last field, description, which holds mixed raw data from which I need only the dates,
and the dates are encapsulated between pipes ("|").
#!/usr/bin/python3
# ./dataparse.py
from __future__ import print_function
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)
import re
with open('test2', 'r') as f:
for line in f:
line = line.strip()
data = f.read()
regex = (r"dn:(.*?)\nftpuser: (.*)\ndescription:* (.*)")
matchObj = re.findall(regex, data)
for index in matchObj:
#print(index)
index_str = ' '.join(index)
new_str = re.sub(r'[=,]', ' ', index_str)
new_str = new_str.split()
print("{0:<30}{1:<20}{2:<50}".format(new_str[1],new_str[8],new_str[9]))
Resulted output:
$ ./dataparse.py
ab02 disabled_5Mar07 Remedy
mela Y ROYALS|none|customer
ab01 Y VGVzdGluZyA
tt#regg.com T REG-JP|7-31-05|REG-JP

The parse method you're using accepts a keyword argument to allow ignoring irrelevant parts of the string.
:param fuzzy:
Whether to allow fuzzy parsing, allowing for string like "Today is
January 1, 2047 at 8:21:00AM".
Demo:
>>> parse('colasas|04/18/2017|NXP', fuzzy=True)
datetime.datetime(2017, 4, 18, 0, 0)
There is another one to also return tuples including the parts of the string that were ignored:
>>> parse('colasas|04/18/2017|NXP', fuzzy_with_tokens=True)
(datetime.datetime(2017, 4, 18, 0, 0), ('colasas|', '|NXP'))
This method won't work perfectly with all of your input strings, but it should get you most of the way there. You may have to do some pre-processing for the stranger ones.

Using some string manipulation
Demo:
s = """description: colasas|04/18/2017|NXP
description: colasas|04/18/2017|NXP
description: Remedy Tkt 01212152 Orcad move
description: FTP Permanent|09|10|2012|FTP
description: Remedy Tkt 01212152 Orcad move
description: TDA Drop12 Account|July 2004|TDA Drop12 Account
description: ftp|121210|ftp
description: Design Foundry Project|16 July 2005|Design Foundry Project
description: FTP Permanent|10/10/2010|FTP
description: WFS-JP|7-31-05|WFS-JP
description: FTP Permanent|10|11|2010|FTP"""
from dateutil.parser import parse
for i in s.split("\n"):
val = i.split("|", 1) #Split by first "|"
if len(val) > 1: #Check if Date in string.
val = val[1].rpartition("|")[0] #Split by right "|"
print( parse(val, fuzzy=True) )
Output:
2017-04-18 00:00:00
2017-04-18 00:00:00
2012-07-03 00:00:00
2004-07-03 00:00:00
2010-12-12 00:00:00
2005-07-16 00:00:00
2010-10-10 00:00:00
2005-07-31 00:00:00
2010-07-03 00:00:00
Regarding your datetime error remove from datetime import datetime
Demo:
import re
import datetime

# Locate the first mm/dd/yyyy token and convert it to a datetime.date.
strh = "description: colasas|04/18/2017|NXP"
date_pattern = re.compile(r'\d{2}/\d{2}/\d{4}')
match = date_pattern.search(strh)
date = datetime.datetime.strptime(match.group(), '%m/%d/%Y').date()
print(date)

text="""
description: colasas|04/18/2017|NXP
description: colasas|04/18/2017|NXP
description: Remedy Tkt 01212152 Orcad move
description: FTP Permanent|09|10|2012|FTP
description: Remedy Tkt 01212152 Orcad move
description: TDA Drop12 Account|July 2004|TDA Drop12 Account
description: ftp|121210|ftp
description: Design Foundry Project|16 July 2005|Design Foundry Project
description: FTP Permanent|10/10/2010|FTP
description: WFS-JP|7-31-05|WFS-JP
description: FTP Permanent|10|11|2010|FTP
"""
import re
reg=re.compile(r"(?ms)\|(\d\d)(\d\d)(\d\d)\||\|(\d{1,2})[\|/\-](\d{1,2})[\|/\-](\d{2,4})\||\|(\d*)\s*(\w+)\s*(\d{4})\|")
dates= [ t[:3] if t[1] else t[3:6] if t[4] else t[6:] for t in reg.findall(text) ]
print(dates)
"""
regexp for |121210| ---> \|(\d\d)(\d\d)(\d\d)\|
for |16 July 2005| ---> \|(\d*)\s*(\w+)\s*(\d{4})\|
for the others ---> \|(\d{1,2})[\|/\-](\d{1,2})[\|/\-](\d{2,4})\|
"""
Output: [('04', '18', '2017'), ('04', '18', '2017'), ('09', '10', '2012'), ('', 'July', '2004'), ('12', '12', '10'), ('16', 'July', '2005'), ('10', '10', '2010'), ('7', '31', '05'), ('10', '11', '2010')]
Get the date as it is:
# Same three alternatives, but each captured whole so the raw date text
# survives exactly as written between the pipes.
reg = re.compile(r"(?ms)\|(\d{6})\||\|(\d{1,2}[\|/\-]\d{1,2}[\|/\-]\d{2,4})\||\|(\d*\s*\w+\s+\d{4})\|")
dates = []
for six_digit, separated, worded in reg.findall(text):
    # Exactly one alternative captured per match; keep the non-empty one.
    dates.append(six_digit or separated or worded)
print(dates)
Output:
['04/18/2017', '04/18/2017', '09|10|2012', 'July 2004', '121210', '16 July 2005', '10/10/2010', '7-31-05', '10|11|2010']

I achieved it through regex considering the values between pipes as follows:
"(?:description:* .*\|([0-9]{1,2}[-/]+[0-9]{1,2}[-/]+[0-9]{2,4})\|.*)?"

Related

Matching Regex on new line Python

The following Regex gives me this output (note that I am using Python):
Which is perfect and exactly how I want it to be. However, when I run this in Python it works, but it doesn't capture the continuation lines of vlans when I use groupdict (talking about the second entry):
{'port_name': 'Te1/0/1', 'description': 'CVH10 Mgt+Clstr', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),161-163'}
{'port_name': 'Te1/0/2', 'description': 'CVH10 VM 1', 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Up', 'flow_control': 'On', 'mode': ' T', 'vlans': '(1),11,101,110,'}
{'port_name': 'Fo2/1/1', 'description': None, 'duplex': 'N/A', 'speed': 'N/A', 'neg': 'N/A', 'link_state': 'Detach', 'flow_control': 'N/A', 'mode': None, 'vlans': None}
{'port_name': 'Te2/0/8', 'description': None, 'duplex': 'Full', 'speed': '10000', 'neg': 'Off', 'link_state': 'Down', 'flow_control': 'Off', 'mode': ' A', 'vlans': '1'}
As you can see in the Regex above the second entry matches 19 vlans, but the Python output only gives me 4. How can I fix this?
This is the code that I'm running:
from sys import argv
import re
import pprint
pp = pprint.PrettyPrinter()
script, filename = argv
interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?')
local_list = []
def main():
with open(filename) as current_file:
for linenumber, line in enumerate(current_file, 1):
working_dict = {}
interface_details_result = interface_details.match(line)
if interface_details_result is not None:
working_dict.update(interface_details_result.groupdict())
local_list.append(working_dict)
for each in local_list:
print(each)
if __name__ == '__main__':
main()
Note that I'm using argv, so it's run as: python3 main.py test.txt
The data of the text file is listed below
>show interfaces status
Port Description Duplex Speed Neg Link Flow M VLAN
State Ctrl
--------- --------------- ------ ------- ---- ------ ----- -- -------------------
Te1/0/1 CVH10 Mgt+Clstr Full 10000 Off Up On T (1),161-163
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
120,130,140,150,
160,170,180,190,
200,210,230,240,
250,666,999
Fo2/1/1 N/A N/A N/A Detach N/A
Te2/0/8 Full 10000 Off Down Off A 1
Currently you are reading separate lines, so the pattern will not match for the lines that have only this:
120,130,140,150,
What you could do is read the whole file instead using current_file.read() and add re.M enabling multiline.
In your code you are using this, which will first update the dict, and then append the working_dict resulting in n times the same (last) value as it points to the same dict.
working_dict.update(interface_details_result.groupdict())
local_list.append(working_dict)
If you want to gather all the groupdict's in a list, you can append it using local_list.append(m.groupdict())
import re
import pprint
from sys import argv

pp = pprint.PrettyPrinter()

# re.M makes '^' match at each line start inside the whole-file string, so a
# single finditer() pass also captures the wrapped VLAN continuation lines.
interface_details = re.compile(r'^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?', re.M)


def collect_interfaces(data):
    """Return a list with one groupdict per matched interface row."""
    return [m.groupdict() for m in interface_details.finditer(data)]


def main():
    # The original snippet referenced an undefined `filename`; take it
    # from the command line exactly as the question's script does.
    script, filename = argv
    with open(filename) as current_file:
        local_list = collect_interfaces(current_file.read())
    for each in local_list:
        print(each)


if __name__ == '__main__':
    main()
You are matching line by line.
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
120,130,140,150,
160,170,180,190,
200,210,230,240,
250,666,999
The first line which is-
Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,
passes your regex expression.
But the following lines doesn't. For example the second line is-
120,130,140,150,
For this interface_details.match(" 120,130,140,150,") doesn't match the regex.
Continuing #anirudh's answer,
test_str will hold the entire string data read from file and regex will be your regex
finditer() will return an iterable of matches. re.MULTILINE param will enable pattern search on the entire multi-line string/data
regex = r"^(?P<port_name>[\w\/]+)[^\S\r\n]+(?P<description>(?!Full\b|N\/A\b)\S+(?:[^\S\r\n]+\S+)*?)?\s+(?P<duplex>Full|N\/A)\b\s+(?P<speed>[\d\w\/]+)\s+(?P<neg>[\w\/]+)\s+(?P<link_state>[\w]+)\s+(?P<flow_control>[\w\/]+)(?:(?P<mode>[^\S\r\n]+\w+)(?:[^\S\r\n]+(?P<vlans>[\d(),-]+(?:\r?\n[^\S\r\n]+[\d(),-]+)*))?)?"
test_str = ("Port Description Duplex Speed Neg Link Flow M VLAN\n"
" State Ctrl\n"
"--------- --------------- ------ ------- ---- ------ ----- -- -------------------\n"
"Te1/0/1 CVH10 Mgt+Clstr Full 10000 Off Up On T (1),161-163\n"
"Te1/0/2 CVH10 VM 1 Full 10000 Off Up On T (1),11,101,110,\n"
" 120,130,140,150,\n"
" 160,170,180,190,\n"
" 200,210,230,240,\n"
" 250,666,999\n"
"Fo2/1/1 N/A N/A N/A Detach N/A\n"
"Te2/0/8 Full 10000 Off Down Off A 1")
for match in re.finditer(regex, test_str, re.MULTILINE):
print(match.groupdict())
This will get you the result you need. Above solution is a combination of this answer and the code generated from this site

Convert string to date using datefinder

An issue occurs when I try to find a date in a .txt file using datefinder. I have the feeling I am unnecessarily switching between data types to obtain the result I desire.
Underneath is a MWE which results in generator object, which in turn is empty when changed to a list. I would like to obtain a datetime in the format %d-%m-%Y.
MWE:
import datefinder

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
for line in f:
    if "creation date" in line:
        date_line = str(line)
        # Fixed: the original passed the undefined name `_date_line`,
        # which raises NameError before datefinder ever runs.
        rev_date = datefinder.find_dates(date_line)
dateutil's parser seems to do a better job:
# `import dateutil` alone does not load the `parser` submodule, so
# `dateutil.parser.parse` raises AttributeError; import it explicitly.
import dateutil.parser

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
dates = []
for line in f:
    try:
        # fuzzy=True skips the non-date words around the timestamp.
        dates.append(dateutil.parser.parse(line, fuzzy=True))
    except dateutil.parser.ParserError:
        # Lines with no recognisable date are simply skipped.
        pass
print(dates)
# [datetime.datetime(2021, 4, 25, 8, 52, 6)]
For the specific use-case:
# Stop at the first line carrying the marker; `rev_date` stays unset when
# the marker never appears (same as the original).
for line in f:
    if "* Model creation date:" not in line:
        continue
    rev_date = dateutil.parser.parse(line, fuzzy=True)
    break
print(rev_date)
# 2021-04-25 08:52:06
Seems datefinder.find_dates works based on :. If you can remove : character after creation date get right result.
If always your string include creation date: you can remove this substring after if statement:
import datefinder

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
for line in f:
    if "creation date" in line:
        # Fixed typo ('creattion date:' never matched, so nothing was
        # removed); stripping the prefix keeps datefinder away from the
        # colon that confuses it.
        date_line = line.replace('creation date:', '')
        rev_date = datefinder.find_dates(date_line)

Get json surrounded by script tag, using bs4 in python

Im using bs4 for Python, I want to get a json from a web page but its like this:
<script>
vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});
</script>
Using beautifulsoup for python, but there is no class to identify
Thank you
You can simply use the 'script' tag to find the element:
# Parse the markup with the stdlib-backed parser; the single <script>
# element can then be located by tag name alone -- no class/id needed.
soup = BeautifulSoup('''<script>vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});</script>''', 'html.parser')
# .contents[0] is the script element's only text child: the raw JS source.
js_code = soup.find('script').contents[0]
js_code is then
vtex.events.addData({"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"});
The tricky sketchy part is getting the json from it.
I will rarely root for regex for this kind of tasks, but this is a rare one.
import re
...
# js_code is the raw JS from the <script> tag (see the snippet above).
js_code = soup.find('script').contents[0]
# Greedy {.*} grabs from the first '{' to the last '}' -- i.e. the whole
# object literal passed to vtex.events.addData(...).
print(re.search('{.*}', js_code).group(0))
This outputs
{"pageCategory":"Product","pageDepartment":"Calzado","pageUrl":"http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p","pageTitle":"AIR FORCE 1 07 LV8 | MASCULINO - tafmx","skuStockOutFromShelf":[],"skuStockOutFromProductDetail":["23312","23313","23314","23316","23325","23326","23327","23328"],"shelfProductIds":["140","141","142","3775","3777","3782","3785","545","17","314","318","530","645","801","822","940"],"accountName":"tafmx","pageFacets":[],"productId":"3829","productReferenceId":"CU8070-100","productEans":["194502172393","194502172409","194502172416","194502172423","194502172430","194502172447","194502172454","194502172461","194502172478","194502172485","194502172492","194502172508","194502172515","194502172522","194502172539","194502172546","194502172553"],"skuStocks":{"23312":0,"23313":0,"23314":0,"23315":11,"23316":0,"23317":19,"23318":29,"23319":22,"23320":12,"23321":7,"23322":9,"23323":15,"23324":14,"23325":0,"23326":0,"23327":0,"23328":0},"productName":"AIR FORCE 1 07 LV8","productBrandId":2000004,"productBrandName":"Nike","productDepartmentId":7,"productDepartmentName":"Calzado","productCategoryId":8,"productCategoryName":"Sneakers","productListPriceFrom":"2199","productListPriceTo":"2199","productPriceFrom":"2199","productPriceTo":"2199","sellerId":"1","sellerIds":"1"}
Which can be converted to a Python dict using json.loads.
import json
...
# The extracted object literal is valid JSON, so it loads straight into
# a Python dict.
print(json.loads(re.search('{.*}', js_code).group(0)))
Outputs
{'pageCategory': 'Product', 'pageDepartment': 'Calzado', 'pageUrl': 'http://www.taf.com.mx/air-force-1-07-lv8-cu8070-100/p', 'pageTitle': 'AIR FORCE 1 07 LV8 | MASCULINO - tafmx', 'skuStockOutFromShelf': [], 'skuStockOutFromProductDetail': ['23312', '23313', '23314', '23316', '23325', '23326', '23327', '23328'], 'shelfProductIds': ['140', '141', '142', '3775', '3777', '3782', '3785', '545', '17', '314', '318', '530', '645', '801', '822', '940'], 'accountName': 'tafmx', 'pageFacets': [], 'productId': '3829', 'productReferenceId': 'CU8070-100', 'productEans': ['194502172393', '194502172409', '194502172416', '194502172423', '194502172430', '194502172447', '194502172454', '194502172461', '194502172478', '194502172485', '194502172492', '194502172508', '194502172515', '194502172522', '194502172539', '194502172546', '194502172553'], 'skuStocks': {'23312': 0, '23313': 0, '23314': 0, '23315': 11, '23316': 0, '23317': 19, '23318': 29, '23319': 22, '23320': 12, '23321': 7, '23322': 9, '23323': 15, '23324': 14, '23325': 0, '23326': 0, '23327': 0, '23328': 0}, 'productName': 'AIR FORCE 1 07 LV8', 'productBrandId': 2000004, 'productBrandName': 'Nike', 'productDepartmentId': 7, 'productDepartmentName': 'Calzado', 'productCategoryId': 8, 'productCategoryName': 'Sneakers', 'productListPriceFrom': '2199', 'productListPriceTo': '2199', 'productPriceFrom': '2199', 'productPriceTo': '2199', 'sellerId': '1', 'sellerIds': '1'}
Note that you may need to use a more complex regex if the script tag contains other things you did not show in the question.

How to extract the file Data in python

I wanted to extract the date from the given string on the basis of tag.
My string is -
DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017
NUMBER: 201707250008754 RATE: 10.00
I want something like this -
If I give "DATE" it should return 7/25/2017 only
if I give "RETURN DATE" it should return 7/26/2017
if I give the "NUMBER" it should return 201707250008754
and so on.
How can we achieve this in Python 2.7? (Note: the dates and numbers in the string are always random.)
You can create a dictionary from the string's contents with re:
import re

s = 'DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017 NUMBER: 201707250008754 RATE: 10.00'
# Tokens alternate: a label (text before ':') then its date/number value.
tokens = re.findall('[a-zA-Z\s]+(?=:)|[\d/\.]+', s)
d = {}
for label, value in zip(tokens[0::2], tokens[1::2]):
    d[label.lstrip()] = value   # labels carry the leading space separator
for field in ['DATE', 'RETURN DATE', 'NUMBER']:
    print(d[field])
Output:
7/25/2017
7/26/2017
201707250008754
Use dict to map key (eg: 'DATE' ) to its value.
import re

s = '''DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017 NUMBER: 201707250008754 RATE: 10.00'''
# Each match pairs a "LABEL:" with the numeric/date token that follows it.
items = re.compile('\s*(.*?)\:\s*([0-9/.]*)').findall(s)
# e.g. [('DATE', '7/25/2017'), ('DATE OPENED', '7/25/2017'), ...]
info = dict(items)
# e.g. {'DATE': '7/25/2017', 'RETURN DATE': '7/26/2017', ...}
for key in ('DATE', 'RETURN DATE', 'NUMBER'):
    print(info[key])

How to parse this web page (and convert into a dictionary) in Python

I want to fetch some data from a certain web page (online book store).
The web pages look like this: http://www.rob389.com/dp/tr/11/9789754681383
I want to parse the <script> section that contains the following string:
tOBJ.DATA[0].MMM00_ITEM_CODE="9789754681383"
Where 9789754681383 is the ISBN number.
I first need to find and extract the correct <script> section.
Then, I want to iterate through the content of the tOBJ.DATA[0] object and convert it into a dictionary which would look like:
my_dict["MMM00_ITEM_CODE"] = "9789754681383"
my_dict["MMM00_TITLE"] = "Uykusuz Bir Gece"
And so on...
I guess it's possible to do this in Python with a few lines, but I cannot figure out how.
Any suggestions?
Thanks in advance.
PS: Any other parsing suggestions are welcome.
I would almost always recommend the use of BeautifulSoup - however, this page seems to have a '<!-->' tag half-way down which kills the parser. So, re to the rescue...
import urllib
import re

# Python 2: pull every `tOBJ.DATA[0].NAME = value;` assignment out of the
# raw page text and build a dict from the name/value pairs.
url = 'http://www.rob389.com/dp/tr/11/9789754681383'
txt = urllib.urlopen(url).read()
pat = re.compile(r'tOBJ\.DATA\[0\]\.(\w+)\s*=\s*([^;]*);', flags=re.M | re.DOTALL)
my_dict = {k: v.strip('"') for k, v in pat.findall(txt)}
which results in a 132-entry dictionary including
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
If you want to restrict it to only keys beginning with 'MMM00', try
my_dict = dict((k,v.strip('"')) for k,v in pat.findall(txt) if k.startswith('MMM00'))
which only returns 15 items.
You can do it with BeautifulSoup and a little bit of code:
def isbnExtractor(url):
    """Print and return the ISBN, i.e. the 7th '/'-separated path segment
    of a rob389.com product URL such as
    http://www.rob389.com/dp/tr/11/9789754681383

    Fixed against the original: print is called as a function (the py2-only
    statement form is a SyntaxError on Python 3), the unused bs4/urllib2/
    urlparse imports are dropped, and the value is returned so callers can
    use it instead of only reading stdout.
    """
    segments = url.strip('/').split('/')
    isbn = segments[6]
    print(isbn)
    return isbn
And this should do the trick. Like this:
PS python
Python 2.7.2 (default, Jun 12 2011, 15:08:59) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from parse import isbnExtractor
>>> url = "http://www.rob389.com/dp/tr/11/9789754681383"
>>> isbnExtractor(url)
9789754681383
The page is so horribly invalid XML that all the normal approaches using parse() from lxml.tree followed by xpath etc fail miserably. So looks like your best bet is something like:
>>> import re
>>> import urllib
>>> import pprint
>>> s = urllib.urlopen("http://www.rob389.com/dp/tr/11/9789754681383").read()
>>> magic = re.compile(r'tOBJ.DATA\[0\].([A-Z0-9_]+)="([^"]+)"')
>>> my_dict = dict(magic.findall(s))
>>> pprint.pprint(my_dict)
{'DISC_PERC': '15.0000000000',
'EXCHANGE_RT': '2.2815',
'LNK_PREFIX': 'uykusuz-bir-gece-jill-murphy',
'LST_PRICE': '7.500000000000000',
'LST_YAX02_CODE': 'YTL',
'MMG00_CODE': '11',
'MMG00_TITLE': 'Kitap',
'MMM00_DESC': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DESC250': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DISC_PERC_SAL': '25',
'MMM00_HEIGHT': '19.6',
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
'MMM00_ZHEIGHT': '1',
'MMS03_PRICE_1': '7.500000000000000',
'MMS03_PRICE_2': '0.000000000000000',
'MMS03_PRICE_3': '7.500000000000000',
'MMS03_YAX02_CODE_1': 'YTL',
'MMS03_YAX02_CODE_2': 'YTL',
'MMS03_YAX02_CODE_3': 'YTL',
'NWS01_DESC': "<BR>New Orleans'da do\xf0an Lillian Hellman'\xfdn ilkgen\xe7li\xf0i daha sonra oyunlar\xfdnda \xfcst\xfc kapal\xfd olarak yer bulacak olan tuhaf ve h\xfdrsl\xfd akrabalar aras\xfdnda ge\xe7ti. New Orleans ve New York aras\xfdnda mekik dokuyarak ge\xe7en y\xfdllarda etraf\xfdndaki farkl\xfd k\xfclt\xfcrleri g\xf6zlemleme \xfeans\xfd buldu. Liseyi bitirdikten sonra Columbia ve New York \xdcniversitesi'ne devam ettiyse de, e\xf0itimini yar\xfdda b\xfdrakarak bir yay\xfdnevinde \xe7al\xfd\xfemaya ba\xfelad\xfd. 1920'lerin bohem hayat\xfdna g\xf6z k\xfdrpt\xfd\xf0\xfd bu d\xf6nemde tan\xfd\xfet\xfd\xf0\xfd gen\xe7 yazar Arthur Kober ile evlenerek Hollywood'a ta\xfe\xfdnd\xfd. <BR><BR>1930'lar\xfdn ba\xfe\xfdnda MGM'de d\xfczeltmenlik yapt\xfd. Hevesli bir solcu oldu\xf0u bu y\xfdllarda, i\xfe arkada\xfelar\xfdn\xfd sendikala\xfemalar\xfd i\xe7in<A class=A2 href=\\",
'NWS01_DESC400': '<A class=A3 href=\\',
'NWS01_ID': '588',
'NWS01_IMAGE': '/UD_OBJS/IMAGES/NWS/HSTTR/Hellman_L_231204_365_1.jpg',
'ON_ESHOP': 'T',
'PEP01_ID': '229016',
'PEP01_NAME': 'Jill Murphy',
'PRD_FNM01_ID': '23462',
'PRD_FNM01_TITLE': 'Mandolin',
'PRD_FNM01_TRD_TITLE': 'Say Yay\xfdnlar\xfd',
'PUR_VAT_VALUE': '8',
'SAL_PRICE': '6.3750000000',
'SAL_VAT_VALUE': '8',
'SAL_YAX02_CODE': 'YTL',
'UD_10': '~410~|',
'UD_10_VAL': 'T\xfcrk\xe7e',
'UD_11': '~1000~|~803.2~|',
'UD_11_VAL': '\xc7ocuk,\xd6yk\xfc',
'UD_12': '~1000.4080~|',
'UD_12_VAL': '\xc7ocuk | 07-12 Ya\xfe | Edebiyat',
'UD_15': '978-975-468-138-3',
'UD_15_VAL': '978-975-468-138-3',
'UD_16': '~PB~|',
'UD_16_VAL': 'Karton Kapak',
'UD_19': '01/01/2010',
'UD_19_VAL': '01/01/2004',
'UD_2': 'The Worst Witch Strikes Again',
'UD_20': '92',
'UD_20_VAL': '92',
'UD_21': '52',
'UD_21_VAL': '52',
'UD_22': '3',
'UD_22_VAL': '3',
'UD_23': '1',
'UD_23_VAL': '1',
'UD_24': '~HM1~|',
'UD_24_VAL': '1. Hamur',
'UD_26': '7-12',
'UD_26_VAL': '07-12',
'UD_2_VAL': 'The Worst Witch Strikes Again',
'UD_3': '~229016~|',
'UD_30': '1',
'UD_30_VAL': '1',
'UD_31': '1',
'UD_31_VAL': '1',
'UD_34': '~1~|',
'UD_34_VAL': '1-3 G\xfcn',
'UD_36': '1',
'UD_36_VAL': '1',
'UD_39': 'VAR',
'UD_39_VAL': 'Var',
'UD_3_VAL': 'Jill Murphy',
'UD_42': '~410~|',
'UD_42_VAL': 'T\xfcrk\xe7e',
'UD_6': '~239986~|',
'UD_6_VAL': 'Seza Sunar',
'YAX02_CODE': 'EUR'}
>>>

Categories