multiple separator in a string python - python

# Sample input: every name is followed by ".*/", every value by ".#/",
# except the last value, which is followed by a single ".".
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
I have a string like this, and my problem is how to split it into two lists. The output should look roughly like this:
name = ['Brand','Color','Type','Power Source']
value = ['Smart Planet','Yellow','Sandwich Maker','Electrical']
Is there a solution for this?

# Split the record string `text` into two parallel lists: names and values.
name = []
value = []
# The last value is terminated by a bare "." instead of ".#/"; drop that
# terminator so it does not end up inside the final value.
records = text[:-1] if text.endswith('.') else text
for pair in records.split('.#/'):
    # maxsplit=1: only the first ".*/" separates the name from the value.
    fields = pair.split('.*/', 1)
    name.append(fields[0])
    value.append(fields[1])

This is one approach using re.split and list slicing.
Ex:
import re

text = "Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
# Every run of characters that are neither ASCII letters nor whitespace is
# treated as a separator. The pattern is a raw string so that "\s" reaches the
# regex engine instead of being parsed as an (invalid) string escape.
# Empty pieces, such as the one after the trailing ".", are discarded.
# NOTE: digits or punctuation inside a name or value also act as separators.
data = [piece for piece in re.split(r"[^A-Za-z\s]+", text) if piece]
name = data[::2]    # even positions hold the names
value = data[1::2]  # odd positions hold the values
print(name)
print(value)
Output:
['Brand', 'Color', 'Type', 'Power Source']
['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical']

You can use regex to split the text, and populate the lists in a loop.
Using regex you protect your code from invalid input.
import re

name, value = [], []
# Records are separated by ".#/"; inside a record ".*/" separates the name
# from the value.
for ele in re.split(r'\.#\/', text):
    # Unpacking raises ValueError unless the record contains exactly one ".*/".
    k, v = ele.split('.*/')
    name.append(k)
    value.append(v)
# The final value still carries the trailing "." of the input.
print(name, value)
['Brand', 'Color', 'Type', 'Power Source'] ['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical.']

text = "Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
name = []
value = []
chars = []  # characters of the field currently being read
i = 0
# Scan left to right without split(): ".*/" closes a name, ".#/" closes a
# value, and anything else belongs to the current field.
while i < len(text):
    # str.startswith(prefix, start) never reads past the end of the string,
    # so no manual bounds checks are needed.
    if text.startswith('.*/', i):
        name.append(''.join(chars))
        chars = []
        i += 3
    elif text.startswith('.#/', i):
        value.append(''.join(chars))
        chars = []
        i += 3
    else:
        chars.append(text[i])
        i += 1
# What is left after the last separator is the final value; it is terminated
# by a single "." rather than by ".#/".
last = ''.join(chars)
if last.endswith('.'):
    last = last[:-1]
# Only add it when a name is still waiting for its value (nothing is added
# for empty input).
if len(value) < len(name):
    value.append(last)
print(name)
print(value)
This will work.

Related

How to split the given 'key-value' list into two lists separated as 'keys' and 'values' with python

This is my List
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
I need to separate the List into two different lists, 'keys' and 'values':
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
KeyList
KeyList = ['function', 'string', 'hello', 'new', 'test']
ValueList
ValueList = ['function1', 'string1', 'hello1', 'new1', 'test1']
There are several possible approaches. One is the method proposed by Tim, but if you are not familiar with re you could also do:
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
KeyList = []
ValueList = []
for item in List:
    # maxsplit=1: only the first " = " separates key from value, so a value
    # that itself contains " = " is kept whole.
    val = item.split(' = ', 1)
    KeyList.append(val[0])
    ValueList.append(val[1])  # IndexError if the item has no " = "
print(KeyList)
print(ValueList)
and the output is:
['function', 'string', 'hello', 'new', 'test']
['function1', 'string1', 'hello1', 'new1', 'test1']
You can simply use split(" = ") and unzip the list of key-value pairs to two tuples:
# Split every "key = value" entry into a pair, then transpose the pairs with
# zip(*...) so that all keys land in one tuple and all values in another.
keys, values = zip(*(entry.split(" = ") for entry in List))
# keys
# >>> ('function', 'string', 'hello', 'new', 'test')
# values
# >>>('function1', 'string1', 'hello1', 'new1', 'test1')
This is based on the fact that zip(*a_zipped_iterable) works as an unzipping function.
We can use re.findall here:
import re

inp = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
# One regex pass per entry: the two groups capture the word before and the
# word after " = ". findall(...)[0] raises IndexError when an entry does not
# contain such a pair.
pairs = [re.findall(r'(\w+) = (\w+)', x)[0] for x in inp]
keys = [pair[0] for pair in pairs]
vals = [pair[1] for pair in pairs]
values = [pair[1] for pair in pairs]

How to distribute comma separated element to form a list in python

How to extract/split multi-line comment to make a new list
# Sample data: one client per line, comma-separated fields, each record
# terminated by ";". The two records do not have the same number of fields.
clientInfo="""James,Jose,664 New Avenue,New Orleans,Orleans,LA,8/27/200,123,jjose#gmail.com,;
Shenna,Laureles, 288 Livinghood Heights,Brighton,Livingston,MI,2/19/75,laureles9219#yahoo.com,;
"""
into this kind of list
f_name = ["James","Shenna"]
l_name = ["Jose","Laureles"]
strt = ["664 New Avenue","288 Livinghood Heights"]
cty = ["New Orleans","Brighton"]
state = ["Orleans","Livingston"]
If the order is always the same, you could do something like this:
# Distribute the first five fields of every client record in `clientData`
# into five parallel lists.
f_name = []
l_name = []
strt = []
cty = []
state = []
# Every record ends with ";", so split on that alone; splitting on ";\n "
# only works if each following line starts with a space.
for client in clientData.split(";"):
    # Strip each field: records after the first start with a newline and some
    # fields carry a leading space.
    client_ = [field.strip() for field in client.split(",")]
    # Skip the blank remainder after the last ";" and any record too short to
    # provide all five columns.
    if len(client_) < 5:
        continue
    f_name.append(client_[0])
    l_name.append(client_[1])
    strt.append(client_[2])
    cty.append(client_[3])
    state.append(client_[4])
I could add some exception handling to handle the ; at the end of your string but, leaving that to you.
You can use split and zip.
def extract(string):
    """Transpose ";"-separated records into per-column tuples.

    Each record is a comma-separated list of fields; surrounding whitespace
    is stripped from every field. Blank records (such as the remainder after
    the final ";") are ignored.

    Returns a list with one tuple per column, each holding that column's value
    from every record. Columns stop at the shortest record, because ``zip``
    truncates. An input without any record yields an empty list.
    """
    rows = []
    for record in string.split(";"):
        if not record.strip():
            continue
        rows.append([item.strip() for item in record.split(",")])
    # zip(*rows) turns the list of rows into a sequence of columns.
    return list(zip(*rows))
This will produce
[('James', 'Shenna'), ('Jose', 'Laureles'), ('664 New Avenue', '288 Livinghood Heights'), ('New Orleans', 'Brighton'), ('Orleans', 'Livingston'), ('LA', 'MI'), ('8/27/200', '2/19/75'), ('123', 'laureles9219#yahoo.com'), ('jjose#gmail.com', '')]
It has some tuples at the end you didn't ask for, but it's relatively good. The no_space1 and no_space2 lines are a bit repetitive, but cramming them into one line is worse in my opinion.
You can try:
clientData = """James,Jose,664 New Avenue,New Orleans,Orleans,LA,8/27/200,123,jjose#gmail.com,;
Shenna,Laureles, 288 Livinghood Heights,Brighton,Livingston,MI,2/19/75,laureles9219#yahoo.com,;
"""
# Each record ends with ";" followed by a newline.
data = clientData.split(";\n")
f_name = []
l_name = []
strt = []
cty = []
state = []
for data_line in data:
    line_info = [field.strip() for field in data_line.strip().split(",")]
    # Require five *fields*, not five characters: a short line such as
    # "abcdef" would otherwise pass the check and fail on line_info[1].
    if len(line_info) < 5:
        continue
    f_name.append(line_info[0])
    l_name.append(line_info[1])
    strt.append(line_info[2])
    cty.append(line_info[3])
    state.append(line_info[4])
print(f_name)
print(l_name)
print(strt)
print(cty)
print(state)
Output:
['James', 'Shenna']
['Jose', 'Laureles']
['664 New Avenue', '288 Livinghood Heights']
['New Orleans', 'Brighton']
['Orleans', 'Livingston']

Categorizing sentence using dictionary

I am using the function below to categorize sentences into themes:
def theme(x):
    """Return one category label per sentence in ``x``.

    A sentence gets the category of the first keyword (in the order listed
    below) that occurs in it as a case-sensitive substring, or the string
    'None' when no keyword occurs.
    """
    # Order matters: it reproduces the precedence of an if/elif chain.
    keyword_categories = (
        ('AC', 'AC problem'),
        ('insects', 'Cleanliness'),
        ('clean', 'Cleanliness'),
        ('food', 'Food Problem'),
        ('delay', 'Train Delayed'),
    )
    output = []
    for i in x:
        category = next(
            (label for keyword, label in keyword_categories if keyword in i),
            'None',
        )
        output.append(category)
    return output
I don't want to use repeated if statements for every word in a category. Instead, I want to give it a list/dictionary, e.g. Cleanliness = ['Clean', 'Cleaned', 'spoilt', 'dirty'], so that a sentence gets the category 'Cleanliness' if it contains any of the words in the list. How can I do that?
You can use a dict of sets to structure your words with categories, and then generate a word-to-category lookup dict based on the said structure:
# Words grouped by the category they indicate.
categories = {
    'Cleanliness': {'insects', 'clean'},
    'AC Problem': {'AC'},
    'Food Problem': {'food'},
    'Train Delayed': {'delay'},
}
# Inverted index: word -> category, built once from the structure above.
lookup = {word: category for category, words in categories.items() for word in words}


def theme(x):
    """Return the set of categories of the words in ``x``.

    Each word is looked up exactly (case-sensitive); a word that is not in
    ``lookup`` contributes the string 'None'.
    """
    return {lookup.get(word, 'None') for word in x}
so that theme(['AC', 'clean', 'insects']) would return a set of corresponding categories:
{'Cleanliness', 'AC Problem'}
This should do what you're asking. I set all the keys to lowercase and converted i to lowercase when checking if you get a match, but with different capitalization, it still counts.
def theme(x):
    """Return one category per word in ``x``, matching case-insensitively.

    Each word is lower-cased and looked up in the table below; a word without
    an entry yields the string "None".
    """
    # Keys are lowercase so that the lookup ignores capitalization.
    myDict = {
        "ac": "AC problem",
        "insects": "Cleanliness",
        "clean": "Cleanliness",
        "food": "Food Problem",
        "delay": "Train Delayed",
    }
    # dict.get with a default avoids both the KeyError and a second lookup.
    return [myDict.get(i.lower(), "None") for i in x]
Here's some examples:
>>>print(theme(['Clean', 'Cleaned', 'spoilt', 'dirty']))
['Cleanliness', 'None', 'None', 'None']
>>>print(theme(['Delay', 'Ham', 'Cheese', 'Insects']))
['Train Delayed', 'None', 'None', 'Cleanliness']
I have worked out another way:
def theme(x):
    """Return one category per sentence in ``x`` based on whole-word matches.

    Each sentence is lower-cased and split on whitespace; it is labelled
    'clean' if it shares a word with ``cleanliness``, otherwise 'ac problem'
    if it shares a word with ``ac_problem``, otherwise 'none'.

    ``cleanliness`` and ``ac_problem`` are collections of words that must be
    defined at module level; this snippet does not define them.
    """
    # Build the sets once per call instead of once per sentence.
    clean_words = set(cleanliness)
    ac_words = set(ac_problem)
    output = []
    for i in x:
        words = i.lower().split()
        if clean_words.intersection(words):
            category = 'clean'
        elif ac_words.intersection(words):
            category = 'ac problem'
        else:
            category = 'none'
        output.append(category)
    return output
Maybe you can do it like this:
def theme(x):
    """Return the category of every word in ``x``.

    Words are matched exactly (case-sensitive); a word that is not in the
    table yields ``None``.
    """
    name_dic = {
        "AC": "AC problem",
        "clean": "Cleanliness",
        "food": "Food Problem",
    }
    return [name_dic.get(e) for e in x]
Or more exactly like this:
def theme(x):
    """Return the category of every word in ``x``.

    The word-to-category table is written as a list of pairs and converted to
    a dict for lookup. Words are matched exactly (case-sensitive); a word
    that is not in the table yields ``None``.
    """
    name_list = [
        ("AC", "AC problem"),
        ("clean", "Cleanliness"),
        ("insects", "Cleanliness"),
        ("food", "Food Problem"),
    ]
    name_dic = dict(name_list)
    return [name_dic.get(e) for e in x]
Hope it helps.

Trouble getting right values against each item

I'm trying to parse the item names and their corresponding values from the snippet below. The dt tags hold the names and the dd tags contain the values. A few dt tags do not have corresponding values, so not all names have values. What I wish to do is keep the value blank for any name that doesn't have one.
These are the elements I would like to scrape data from:
# Sample markup: the "Genres:" and "Language:" <dt> entries have no <dd>
# after them; every other <dt> is directly followed by one <dd>.
content="""
<div class="movie_middle">
<dl>
<dt>Genres:</dt>
<dt>Resolution:</dt>
<dd>1920*1080</dd>
<dt>Size:</dt>
<dd>1.60G</dd>
<dt>Quality:</dt>
<dd>1080p</dd>
<dt>Frame Rate:</dt>
<dd>23.976 fps</dd>
<dt>Language:</dt>
</dl>
</div>
"""
I've tried like below:
# Collect the text of every <dt> and of every <dd> as two independent lists.
soup = BeautifulSoup(content,"lxml")
title = [item.text for item in soup.select(".movie_middle dt")]
result = [item.text for item in soup.select(".movie_middle dd")]
# zip pairs the n-th <dt> with the n-th <dd> purely by position and stops at
# the shorter list, so a <dt> that has no <dd> shifts every later pair.
vault = dict(zip(title,result))
print(vault)
It gives me messy results (wrong pairs):
{'Genres:': '1920*1080', 'Resolution:': '1.60G', 'Size:': '1080p', 'Quality:': '23.976 fps'}
My expected result:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p','Frame Rate:':'23.976 fps','Language:':''}
Any help on fixing the issue will be highly appreciated.
You can loop through the elements inside dl. If the current element is dt and the next element is dd, then store the value as the next element, else set the value as empty string.
dl = soup.select('.movie_middle dl')[0]
# Ask only for <dt>/<dd> tags; a bare find_all() returns every descendant
# tag, including any markup nested inside a <dt> or <dd>.
elems = dl.find_all(['dt', 'dd'])
data = {}
for i, el in enumerate(elems):
    if el.name != 'dt':
        continue
    key = el.text.replace(':', '')
    # The value is the element right after this <dt>, but only if it is a
    # <dd>; otherwise the name has no value.
    following = elems[i + 1] if i + 1 < len(elems) else None
    if following is not None and following.name == 'dd':
        data[key] = following.text
    else:
        data[key] = ''
You can use BeautifulSoup to parse the dl structure, and then write a function to create the dictionary:
from bs4 import BeautifulSoup as soup
import re
def parse_result(d):
    """Pair each name line with the value line that directly follows it.

    ``d`` is a list of single-line strings such as ``'<dt>Size:</dt>'`` and
    ``'<dd>1.60G</dd>'``. Yields ``[name, value]`` lists with the tags sliced
    off. When a ``<dt>`` line is not directly followed by a ``<dd>`` line,
    the value is ``''`` and the following line is processed on its own.
    """
    i = 0
    while i < len(d):
        a = d[i]
        following = d[i + 1] if i + 1 < len(d) else None
        if following is not None and '<dt' in a and '<dd' in following:
            # [4:-5] drops the 4-character opening tag and the 5-character
            # closing tag.
            yield [a[4:-5], following[4:-5]]
            i += 2  # the <dd> line has been consumed as well
        else:
            yield [a[4:-5], '']
            i += 1
# Render the <dl> element as text, split it into lines, drop empty lines,
# drop the first and last remaining lines with [1:-1], and build a dict from
# the pairs that parse_result yields.
print(dict(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1])))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
For a slightly longer, although cleaner solution, you can create a decorator to strip the HTML tags of the output, thus removing the need for the extra string slicing in the main parse_result function:
def strip_tags(f):
    """Decorator: collect the (name, value) pairs produced by ``f`` in a dict.

    Both strings of every pair are sliced with ``[4:-5]``, which removes a
    4-character opening tag and a 5-character closing tag; an empty value
    stays ``''``.
    """
    from functools import wraps

    @wraps(f)
    def wrapper(data):
        return {a[4:-5]: b[4:-5] for a, b in f(data)}
    return wrapper


@strip_tags
def parse_result(d):
    """Pair each ``<dt>`` line with the ``<dd>`` line that directly follows.

    ``d`` is a list of single-line strings. The undecorated generator yields
    ``[name_line, value_line]`` with the tags still attached (``''`` when no
    ``<dd>`` line follows); ``strip_tags`` turns that into a dict of plain
    texts, which is what callers receive.
    """
    i = 0
    while i < len(d):
        a = d[i]
        following = d[i + 1] if i + 1 < len(d) else None
        if following is not None and '<dt' in a and '<dd' in following:
            yield [a, following]
            i += 2  # the <dd> line has been consumed as well
        else:
            yield [a, '']
            i += 1
# Render the <dl> element as text, split it into lines, drop empty lines,
# drop the first and last remaining lines with [1:-1], and print what
# parse_result returns for them.
print(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1]))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
from collections import defaultdict

test = soup.text.split('\n')
d = defaultdict(list)
# Walk the lines together with their successor. The '' appended at the end
# stands in for "the line after the last one", so a key on the final line no
# longer indexes past the end of the list.
for line, following in zip(test, test[1:] + ['']):
    if ':' not in line:
        continue  # not a key line
    # A key followed by another key has no value of its own.
    d[line] = following if ':' not in following else ''
d
defaultdict(list,
{'Frame Rate:': '23.976 fps',
'Genres:': '',
'Language:': '',
'Quality:': '1080p',
'Resolution:': '1920*1080',
'Size:': '1.60G'})
The logic here is that you know that every key will have a colon. Knowing this, you can write an if else statement to capture the unique combinations, whether that is key followed by key or key followed by value
Edit:
In case you wanted to clean your keys, below replaces the : in each one:
# Same mapping as d, with every ":" removed from the keys.
d1 = {key.replace(':', ''): value for key, value in d.items()}
d1
{'Frame Rate': '23.976 fps',
'Genres': '',
'Language': '',
'Quality': '1080p',
'Resolution': '1920*1080',
'Size': '1.60G'}
The problem is that empty elements are not present. Since there is no hierarchy between the <dt> and the <dd>, I'm afraid you'll have to craft the dictionary yourself.
vault = {}
category = ""  # text of a <dt> still waiting for its <dd>; "" when none is pending
for item in soup.find("dl").findChildren():
    if item.name == "dt":
        if category != "":
            # The previous <dt> never got a <dd>: record it with an empty value.
            vault[category] = ""
        # The current <dt> becomes the pending name. Resetting to "" here
        # would store the next <dd> under the empty key.
        category = item.text
    elif item.name == "dd":
        vault[category] = item.text
        category = ""
# A <dt> at the very end has no <dd> after it either.
if category != "":
    vault[category] = ""
Basically this code iterates over the child elements of the <dl> and fills the vault dictionary with the values.

Multiple split of input?

I know that you can use split() to split a user input into two, but how would you split input that consists of multiple variables ? For example:
User input:
Shawn=14:soccer#2991842
What I would like to do:
name = Shawn
age = 14
course = soccer
idnumber = 2991842
What's the best way to do such thing ?
import re

# Named user_input rather than str so the builtin str type is not shadowed.
user_input = 'Shawn=14:soccer#2991842'
keys = ['name', 'age', 'course', 'idnumber']
# The character class splits on any one of "=", ":" or "#".
values = re.split(r'[=:#]', user_input)
# print() with a single argument works on both Python 2 and Python 3.
print(dict(zip(keys, values)))
Out[114]: {'age': '14', 'course': 'soccer', 'idnumber': '2991842', 'name': 'Shawn'}
I think Regex will work best here:
>>> from re import split
>>> mystr = "Shawn=14:soccer#2991842"
>>> split("\W", mystr)
['Shawn', '14', 'soccer', '2991842']
>>> lst = split("\W", mystr)
>>> name = lst[0]
>>> name
'Shawn'
>>> age = lst[1]
>>> age
'14'
>>> course = lst[2]
>>> course
'soccer'
>>> idnumber = lst[3]
>>> idnumber
'2991842'
>>>
Also, the above is a step-by-step demonstration. You can actually just do:
# Raw string: "\W" (any non-word character) must reach the regex engine
# rather than being parsed as a string escape. Unpacking requires exactly
# four pieces.
name, age, course, idnumber = split(r"\W", mystr)
Here's how I would do it.
def splitStr(str):
    """Parse a ``name=age:course#idnumber`` string and print its four fields.

    The string is split on ":" first, then the left part on "=" and the right
    part on "#". ``age`` and ``idnumber`` are converted with ``int``, so a
    non-numeric value raises ValueError; a missing delimiter raises
    IndexError. Nothing is returned.
    """
    temp = str.split(':')
    temp_nameAge = temp[0].split('=')
    temp_courseId = temp[1].split('#')
    name = temp_nameAge[0]
    age = int(temp_nameAge[1])
    course = temp_courseId[0]
    idnumber = int(temp_courseId[1])
    # print() as a function call with one argument runs on Python 2 and 3.
    print('Name = %s, age = %i, course = %s, id_number = %i' % (name, age, course, idnumber))
Another thing you can do is use split like: string.split(":").
Then you can change the format to "name:age:course:number"
You could just keep splitting the splits...
text2split = "Shawn=14:soccer#2991842"

# Peel off one delimiter at a time: "=" first, then ":", then "#".
# At every step the first two pieces are the field and the rest to process.
name, _tail = text2split.split('=')[:2]
age, _tail = _tail.split(':')[:2]
course, idnumber = _tail.split('#')[:2]
This isn't the most elegant way to do it, but it'll work so long as text2split always has the same delimiters.
If you are ok with storing them under dictionary keys, you could use named group references
import re
x='shawn=14:soccer#2991842'
# Each (?P<label>...) group captures one field under that label, and
# groupdict() returns the captures as a dict keyed by label. The literal
# "=", ":" and "#" between the groups are the delimiters. The result is not
# assigned to a name here.
re.match(r'(?P<name>.*?)=(?P<age>.*):(?P<course>.*?)#(?P<idnumber>.*)', x).groupdict()
{'idnumber': '2991842', 'course': 'soccer', 'age': '14', 'name': 'shawn

Categories