I want to clean the elements of the test_data list that contain any of the characters in bad_chars, and append the cleaned strings to a new list, stripped_test_data, but the script does not work.
The following code:
# Sample year strings; some are wrapped in parentheses or prefixed with "c." / "C.".
test_data = ["1912", "1929", "1913-1923",
"(1951)", "1994", "1934",
"c. 1915", "1995", "c. 1912",
"(1988)", "2002", "1957-1959",
"c. 1955.", "c. 1970's",
"C. 1990-1999"]
# Characters that are meant to be removed from every entry.
bad_chars = ["(",")","c","C",".","s","'", " "]
# Intended to return a new list holding the entries of `data` without the bad characters.
def strip_characters(data):
stripped_test_data = []
for each in data:
# NOTE: bad_chars is a list; `<list> in <str>` raises the TypeError shown in the traceback.
if bad_chars in each:
tostr = str(each)
# NOTE(review): str.replace also needs a string (not a list) as its first argument.
adjusted = tostr.replace(bad_chars, "")
# NOTE(review): this appends the unmodified `tostr`, not `adjusted`.
stripped_test_data.append(tostr)
else:
adjusted = each
stripped_test_data.append(each)
return stripped_test_data
adjsuted_data = strip_characters(test_data)
when run throws the error:
> > TypeErrorTraceback (most recent call last) <ipython-input-1-d9d5a3a4542a> in <module>()
> 20 return stripped_test_data
> 21
> ---> 22 adjsuted_data = strip_characters(test_data)
> 23
> 24
>
> <ipython-input-1-d9d5a3a4542a> in strip_characters(data)
> 11 stripped_test_data = []
> 12 for each in data:
> ---> 13 if bad_chars in each:
> 14 tostr = str(each)
> 15 adjusted = tostr.replace(bad_chars, "")
TypeError: 'in <string>' requires string as left operand, not list
Can you please help me understand what is wrong with the code and how to proceed?
str.strip can handle multiple characters:
# Join the individual characters into one string; str.strip treats its argument as a set of characters.
bad_chars_joined = ''.join(bad_chars)
# NOTE: strip only trims these characters from the two ends of each string; occurrences in the middle are kept.
[t.strip(bad_chars_joined) for t in test_data]
Output:
['1912',
'1929',
'1913-1923',
'1951',
'1994',
'1934',
'1915',
'1995',
'1912',
'1988',
'2002',
'1957-1959',
'1955',
'1970',
'1990-1999']
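Your code checks whether the entire bad_chars list is contained in each string (`bad_chars in each`), but the `in` operator on a string needs a single string on its left-hand side, so you have to test one character at a time.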
try this:
# Sample year strings; some are wrapped in parentheses or prefixed with "c." / "C.".
test_data = ["1912", "1929", "1913-1923",
             "(1951)", "1994", "1934",
             "c. 1915", "1995", "c. 1912",
             "(1988)", "2002", "1957-1959",
             "c. 1955.", "c. 1970's",
             "C. 1990-1999"]
# Substrings that must be removed from every entry.
bad_chars = ["(", ")", "c", "C", ".", "s", "'", " "]


def strip_characters(data):
    """Return a new list with every ``bad_chars`` entry removed from each item.

    Each item is converted with ``str`` first, so non-string items are
    accepted. The result has exactly one element per input element, in the
    same order; ``data`` itself is not modified.
    """
    stripped_test_data = []
    for each in data:
        adjusted = str(each)
        # Apply *all* removals to the same string before storing it; looping
        # over the characters on the outside would append each item once per
        # character, with only that single character removed.
        for char in bad_chars:
            adjusted = adjusted.replace(char, "")
        stripped_test_data.append(adjusted)
    return stripped_test_data


adjsuted_data = strip_characters(test_data)
Related
I need it to extract the word that starts with a capital letter, if and only if, this word is preceded by the beginning of the sentence or by one of these options (?:,and|and|her friends|,or |or |,)
import re
match_names = ""
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
# Prefix a sentinel string that is unlikely to occur in real text, so the pattern can also match at the start of the sentence.
input_sense = "rlt99ll" + input_sense
if match := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
match_names = match
print("match names: ")
print(match_names)
input_sense = input_sense.replace("rlt99ll", "") # the sentinel was only needed for the pattern
n = 0
print("match_auxs : ")
for name in match_names:
# NOTE: plain assignment does not copy; match_aux and match_names are the same list object,
# so the blanking below is cumulative (this is what produces the reported output).
match_aux = match_names
for m in match_aux:
if (m == name):
match_aux[n] = ""
n += 1
n = 0
print(match_aux)
I need that output lists:
match names:
['Susan', 'Lisa', 'Veronica', 'Katy']
match_auxs :
['','Lisa', 'Veronica', 'Katy']
['Susan', '', 'Veronica', 'Katy']
['Susan', 'Lisa', '', 'Katy']
['Susan', 'Lisa', 'Veronica', '']
But I get this ( and it's wrong)...
match names:
['Susan', 'Lisa', 'Veronica', 'Katy']
match_auxs :
['', 'Lisa', 'Veronica', 'Katy']
['', '', 'Veronica', 'Katy']
['', '', '', 'Katy']
['', '', '', '']
As said in the comments, assigning a list to another variable doesn't create a copy of it. Along with this, your code can be simplified by using functions like enumerate:
import re
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
# Prefix a sentinel string that is unlikely to occur in real text, so the
# pattern can also match a name at the very start of the sentence.
input_sense = "rlt99ll" + input_sense
# re.findall always returns a list, so match_names needs no placeholder value.
if match_names := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
    print(f"match names: {match_names}")
# The sentinel was only needed for the pattern; restore the original sentence.
input_sense = input_sense.replace("rlt99ll", "")

print("match_auxs: ")
for index, _ in enumerate(match_names):
    # Work on a copy so that blanking one slot never touches match_names.
    match_aux = match_names.copy()
    match_aux[index] = ""
    print(match_aux)
If you don't want to use copy on the list (for speed), this code will also work:
import re
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
# Prefix a sentinel string that is unlikely to occur in real text, so the
# pattern can also match a name at the very start of the sentence.
input_sense = "rlt99ll" + input_sense
# re.findall always returns a list, so match_names needs no placeholder value.
if match_names := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
    print(f"match names: {match_names}")
# The sentinel was only needed for the pattern; restore the original sentence.
input_sense = input_sense.replace("rlt99ll", "")

print("match_auxs: ")
for index, name in enumerate(match_names):
    # Blank one slot in place, show the list, then put the name straight
    # back. Restoring inside the loop (instead of patching the last slot
    # afterwards) keeps this safe when there were no matches at all.
    match_names[index] = ""
    print(match_names)
    match_names[index] = name
I have a rule-based code that prints out the Noun which is followed by a verb in a sentence
# For every headline, print the first run of one to three nouns that is
# immediately followed by a verb.
for text_id, text in enumerate(news_df['news_title'].values):
    # Remove the comma and full stops
    text = text.replace(',', '').replace('.', '').replace('-','')
    sentence_tags = POSTAG(text.lower())
    print(text)
    # Sentences parts
    for index, part in enumerate(sentence_tags):
        try:
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                print(">", part[0])
                break
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'VB' in sentence_tags[index + 2][1]:
                print(">", part[0], sentence_tags[index + 1][0])
                break
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'NN' in sentence_tags[index + 2][1] and 'VB' in sentence_tags[index + 3][1]:
                print(">", part[0], sentence_tags[index + 1][0], sentence_tags[index + 2][0])
                break
        except IndexError:
            # The look-ahead ran past the last tag, so no rule can match at
            # this position. Only this expected error is ignored; anything
            # else is a real bug and should surface.
            continue
    print()
The output of a sentence following this rule:
high school football players charged after video surfaces showing hazing
> school football players
trump accuser pushes new york to pass the adult survivors act plans to sue
>trump accuser
Is there a way to also print out the position of that Noun that was printed due to the rule?
for example :
>trump accuser , [0,5,"NN"] , [6,13,"VB"]
I changed the script and separated the state machine segment. The most serious problem with this program IMO is it's just returning the first pattern (you can fix it quickly).
import pandas as pd
import nltk
POSTAG = nltk.pos_tag
df = pd.DataFrame({'text': [
    'high school football players charged after video surfaces showing hazing',
    'trump accuser pushes new york to pass the adult survivors act plans to sue',
]})

# For every sentence, print the first run of consecutive nouns that is
# directly followed by a verb, together with each noun's character span and tag.
for text in df['text'].values:
    # Remove the comma and full stops
    text = text.replace(',', '').replace('.', '').replace('-', '')
    tokens = nltk.word_tokenize(text.lower())
    sentence_tags = POSTAG(tokens)
    words = [item[0] for item in sentence_tags]
    tags = [item[1] for item in sentence_tags]

    # [start, end] offsets of every word, counted as if the words were joined
    # by exactly one space.
    # NOTE(review): assumes the cleaned text is single-space separated - confirm for real data.
    start_end = []
    temp = 0
    for word in words:
        start_end.append([temp, temp + len(word)])
        temp += len(word) + 1

    words_to_print = []
    tags_to_print = []
    start_end_to_print = []
    # The state machine: the collected lists are non-empty exactly while we
    # are inside a run of nouns, so they double as the "seen a noun" state.
    for w, t, se in zip(words, tags, start_end):
        if t.startswith('NN'):
            words_to_print.append(w)
            tags_to_print.append(t)
            start_end_to_print.append(se)
        elif not words_to_print:
            # Not inside a noun run yet; nothing to decide.
            continue
        elif t.startswith('VB'):
            # Noun run followed by a verb: this is the pattern we want.
            break
        else:
            # The run was interrupted by something that is not a verb.
            words_to_print = []
            tags_to_print = []
            start_end_to_print = []
    else:
        # The sentence ended without a verb after the nouns, so a trailing
        # noun run does not satisfy the rule and must not be reported.
        words_to_print = []
        tags_to_print = []
        start_end_to_print = []

    print('> ', ' '.join(words_to_print), ' '.join(f'{span} {tag}' for span, tag in zip(start_end_to_print, tags_to_print)))
output:
> school football players [5, 11] NN [12, 20] NN [21, 28] NNS
> trump accuser [0, 5] NN [6, 13] NN
I have a complex list of lists that looks like that :
[[['MARIA DUPONT',
' infos : ',
[' age = 28',
' yeux = bleus',
' sexe = femme']],
[' + ']],
[['PATRICK MARTIN',
' infos : ',
[' age = 53',
' yeux = marrons',
' sexe = homme']],
[' + ']],
[['JULIE SMITH',
' infos : ',
[' age = 17',
'yeux = verts',
'sexe = femme']],
[' fin ']]]
I am trying to transform it into a string. At the end I want to print that :
MARIA DUPONT,
infos :
age = 28
yeux = bleus
sexe = femme
+
PATRICK MARTIN
infos :
age = 53
yeux = marrons
sexe = homme
+
JULIE SMITH
infos :
age = 17
yeux = verts
sexe = femme
fin
My real data are more complicated and I have lists into level 5.
So I am looking for a way to solve the problem I explained to be able to adapt it and apply it to my real data.
I am trying with
''.join(list)
and
''.join(x for x in list)
But in both cases I have the error TypeError: list indices must be integers or slices, not list
I've tried other ways, but now I'm confused and I haven't found a good solution to reach my goal.
Any help would be appreciated, and thanks in advance. (and sorry for my bad english!)
You can use str.join with a single pass over the lists:
# Three records, each shaped as [[name, header, [detail, ...]], [separator]].
data = [
    [['MARIA DUPONT', ' infos : ', [' age = 28', ' yeux = bleus', ' sexe = femme']], [' + ']],
    [['PATRICK MARTIN', ' infos : ', [' age = 53', ' yeux = marrons', ' sexe = homme']], [' + ']],
    [['JULIE SMITH', ' infos : ', [' age = 17', 'yeux = verts', 'sexe = femme']], [' fin ']],
]
# One text block per record: name, header and details on their own lines,
# then the separator surrounded by newlines; blocks are joined by a newline.
r = '\n'.join(
    '\n'.join((name, header, *details, '\n' + separator + '\n'))
    for (name, header, details), (separator,) in data
)
Output:
MARIA DUPONT
infos :
age = 28
yeux = bleus
sexe = femme
+
PATRICK MARTIN
infos :
age = 53
yeux = marrons
sexe = homme
+
JULIE SMITH
infos :
age = 17
yeux = verts
sexe = femme
fin
If your lists are arbitrarily nested, then you can use recursion with a generator:
def flatten(d):
    """Yield every string contained in ``d``, depth-first and in order.

    ``d`` is either a string (yielded as-is) or an iterable whose items are
    themselves strings or further nested iterables.
    """
    if isinstance(d, str):
        yield d
    else:
        # Delegate lazily to each child instead of first collecting all of
        # its results in a temporary list.
        for b in d:
            yield from flatten(b)
print('\n'.join(flatten(data)))
.join() won't work with a list in the list. I can offer you a solution based on recursion.
def list_to_str(_list, sep=""):
    """Concatenate all strings found in an arbitrarily nested list.

    Args:
        _list: A string, or a list whose items are strings or nested lists.
        sep: Text inserted between consecutive strings. The default ``""``
            reproduces plain concatenation; pass ``"\\n"`` to get one string
            per line.

    Returns:
        The strings joined in depth-first order.

    Raises:
        TypeError: If a non-list leaf is not a string.
    """
    def _leaves(node):
        # Walk the nesting depth-first, yielding the non-list leaves in order.
        if isinstance(node, list):
            for child in node:
                yield from _leaves(child)
        else:
            yield node

    # A single join avoids rebuilding the result string once per leaf.
    return sep.join(_leaves(_list))
result_string = list_to_str(your_list)
print(result_string)
I can't tell if you have a list with varying levels of lists but if so, you would probably need a conditional to see if the list goes further and recursively iterate the list.
def convert_list(dataset, sep=''):
    """Return the text of every element of a nested list as one string.

    Args:
        dataset: An iterable whose items are either lists (descended into
            recursively) or arbitrary values (converted with ``str``).
        sep: Text inserted between consecutive elements. The default ``''``
            reproduces plain concatenation; pass ``'\\n'`` to get one element
            per line.

    Returns:
        The converted elements joined in depth-first order.
    """
    def _pieces(items):
        # Walk the nesting depth-first, yielding each leaf as text.
        for element in items:
            if isinstance(element, list):
                yield from _pieces(element)
            else:
                yield str(element)

    # A single join avoids rebuilding the result string once per element.
    return sep.join(_pieces(dataset))
This will not print the newlines you want but it does return the list as a string.
Write a recursive function to get inside your lists like below:
def print_data(input_list):
    """Print every non-list item of a nested list, one per line, depth-first."""
    # Stack of iterators, one per nesting level currently being walked.
    pending = [iter(input_list)]
    while pending:
        for obj in pending[-1]:
            if isinstance(obj, list):
                # Descend into the sub-list; the parent iterator is resumed
                # once the sub-list has been exhausted.
                pending.append(iter(obj))
                break
            print(obj)
        else:
            # Current level exhausted: go back up one level.
            pending.pop()
input_list = [[['MARIA DUPONT',
' infos : ',
[' age = 28',
' yeux = bleus',
' sexe = femme']],
[' + ']],
[['PATRICK MARTIN',
' infos : ',
[' age = 53',
' yeux = marrons',
' sexe = homme']],
[' + ']],
[['JULIE SMITH',
' infos : ',
[' age = 17',
'yeux = verts',
'sexe = femme']],
[' fin ']]]
print_data(input_list)
Here I have a text file. I want to read Adress, Beneficiary, Beneficiary Bank, Acc Nbr, Total US$, Date which is at the top, RUT, BOX. I tried writing some code by myself but I am not able to correctly get the required information and moreover if the length of character changes I will not get correct output. How should I do this such that I will get every required information in a particular string.
The main problem will arise when my slicings will go wrong. For eg: I am using line[31:] for Acc Nbr. But if the address change then my slicing will also go wrong
My Text.txt
2014-11-09 BOX 1531 20140908123456 RUT 21 654321 0123
Girry S.A. CONTADO
G 5 Y Serie A
NO 098765
11 al Rayo 321 - Oqwerty 108 Monteaudio - Gruguay
Pharm Cosco, Inc - Britania PO Box 43215
Dirección Hot Springs AR 71903 - Estados Unidos
Oescripción Importe
US$
DO 7640183 - 50% of the Production Degree 246,123
Beneficiary Bank: Bankue Heritage (Gruguay) S.A Account Nbr: 1234563 Swift: MANIUYMM
Adress: Tencon 108 Monteaudio, Gruguay.
Beneficiary: Girry SA Acc Nbr: 1234567
Servicios prestados en el exterior, exentos de IVA o IRAE
Subtotal US$ 102,500
Iva US$ ---------------
Total US$ 102,500
I.V.A AL DIA Fecha de Vencimiento
IMPRENTA IRIS LTDA. - RUT 210161234015 - 0/40987 17/11/2015
CONSTANCIA N9 1234559842 -04/2013
CONTADO A 000.001/ A 000.050 x 2 VIAS
QWERTYAS ZXCVBIZADA
R. U.T. Bamprador Asdfumldor Final
Fecha 12/12/2014
1º ORIGINAL CLLLTE (Blanco) 2º CASIA AQWERVO (Rosasd)
My Code:
# Python 2 script: scans Text.txt line by line and prints the fields it recognises.
# NOTE(review): the original indentation was lost, so the nesting of the `if` blocks below is not certain.
txt = 'Text.txt'
# NOTE: the file object returned by open() is never closed explicitly.
lines = [line.rstrip('\n') for line in open(txt)]
for line in lines:
# Header line: date, then the text between "BOX " and "RUT", then the text after "RUT ".
if 'BOX' in line:
Date = line.split("BOX")[0]
BOX = line.split('BOX ', 1)[-1].split("RUT")[0]
RUT = line.split('RUT ',1)[-1]
print 'Date : ' + Date
print 'BOX : ' + BOX
print 'RUT : ' + RUT
if 'Adress' in line:
# Fixed offset 8: only correct while the label occupies the first 8 characters of the line.
Adress = line[8:]
print 'Adress : ' + Adress
if 'NO ' in line:
Invoice_No = line.split('NO ',1)[-1]
print 'Invoice_No : ' + Invoice_No
if 'Swift:' in line:
Swift = line.split('Swift: ',1)[-1]
print 'Swift : ' + Swift
# The '/' test restricts this to "Fecha" lines that also contain a slash.
if 'Fecha' in line and '/' in line:
Invoice_Date = line.split('Fecha ',1)[-1]
print 'Invoice_Date : ' + Invoice_Date
if 'Beneficiary Bank' in line:
# Fixed offset 18 skips the label; the value is then cut at the next "Acc" / "Swift".
Beneficiary_Bank = line[18:]
Ben_Acc_Nbr = line.split('Nbr: ', 1)[-1]
print 'Beneficiary_Bank : ' + Beneficiary_Bank.split("Acc")[0]
print 'Ben_Acc_Nbr : ' + Ben_Acc_Nbr.split("Swift")[0]
if 'Beneficiary' in line and 'Beneficiary Bank' not in line:
# Fixed offset 13 skips the label; the value is then cut at "Acc".
Beneficiary = line[13:]
print 'Beneficiary : ' + Beneficiary.split("Acc")[0]
if 'Acc Nbr' in line:
Acc_Nbr = line.split('Nbr: ', 1)[-1]
print 'Acc_Nbr : ' + Acc_Nbr
if 'Total US$' in line:
Total_US = line.split('US$ ', 1)[-1]
print 'Total_US : ' + Total_US
Output:
Date : 2014-11-09
BOX : 1531 20140908123456
RUT : 21 654321 0123
Invoice_No : 098765
Swift : MANIUYMM
Beneficiary_Bank : Bankue Heritage (Gruguay) S.A
Ben_Acc_Nbr : 1234563
Adress : Tencon 108 Monteaudio, Gruguay.
Beneficiary : Girry SA
Acc_Nbr : 1234567
Total_US : 102,500
Invoice_Date : 12/12/2014
Some Code Changes
I have made some changes but still I am not convinced as I need to provide spaces also in split.
I would recommend you to use regular expressions to extract information you need. It helps to avoid the calculation of the numbers of offset characters.
import re
# Print the account number from every line that contains an "Acc Nbr:" label.
# A raw string keeps the backslash in the Windows path literal.
with open(r'C:\Quad.txt') as f:
    for line in f:
        # Capture the run of non-space characters after the label. A lazy
        # group at the very end of a pattern, "(.*?)", is satisfied by the
        # empty string and would therefore never capture the number.
        match = re.search(r"Acc Nbr:\s*(\S+)", line)
        if match is not None:
            Acc_Nbr = match.group(1)
            print(Acc_Nbr)
# etc...
You can use str.find to get the index of the label. For example:
# Split the line around the label itself instead of using a hard-coded
# character offset: the value starts right after "Acc Nbr:", however many
# spaces follow, and strip() drops that padding and the trailing newline.
_, label_found, after_label = line.partition("Acc Nbr:")
if label_found:
    Acc_Nbr = after_label.strip()
    print(Acc_Nbr)
note that find gives you index of first char of item you searched.
I need to create a program which removes punctuation, some specific words, duplicates and return the words left and their respective lines. I also need to keep track of the duplicates. For instance,
Python IDLE
Indexer: type in lines, finish with a . at start of line only
It is a briskly blowing wind that blows
from the north, the North of my youth.
The wind is cold too, colder than the
winds of yesteryear.
.
The index is:
brisk 1
blow 1
wind 1, 3, 4
north 2
youth 2
cold 3
yesteryear 4
The Problem: I need to keep track of the line numbers of the remaining words, including their duplicates, and I have not been able to do that.
from string import *
# Python 2 script (raw_input, print statement, functions from the `string` module).
# NOTE(review): the original indentation was lost, so the nesting of the blocks below is not certain.
# Words that must not appear in the index.
stopWords = [ "a", "i", "it", "am", "at", "on", "in", "to", "too", "very", \
"of", "from", "here", "even", "the", "but", "and", "is", "my", \
"them", "then", "this", "that", "than", "though", "so", "are" ]
# Two-letter suffixes removed from the end of a word.
endings = [ "es" , "ed" , "er", "ly"]
# Characters removed from the start or end of a word.
punctuation = [ ".", "," , ":" , ";" , "!" , "?" , "&" , "'" ]
# NOTE: raw_input is called once here, so only one line of input is read.
unindexed_sentence = raw_input("type in lines, finish with a . at start of line only").lower()
#removing duplicates.
def unique_string(l):
ulist = []
ulist2 = []
# List comprehension used only for its side effect: keeps the first occurrence of each item.
[ulist.append(x) for x in l if x not in ulist]
# NOTE(review): `x` is used outside the comprehension above, and `global ulist2` comes after ulist2 was assigned.
[ulist2.append(x)]
global ulist2
return ulist
# NOTE: duplicates are dropped here, before the text is split into lines, so later occurrences are no longer available.
unindexed_sentence =' '.join(unique_string(unindexed_sentence.split()))
unindexed_sentence1 = split(unindexed_sentence,"\n")
list_unindexed = []
# splitting
i = 0
while i<len(unindexed_sentence1):
list_unindexed += [split(unindexed_sentence1[i])]
i+=1
countline = 0
i = 0
# Normalise every word: strip punctuation, plural "s", the listed endings and "ing"; delete stop words.
while i < len(list_unindexed):
j = 0
while j < len(list_unindexed[i]):
if list_unindexed[i][j][0] in punctuation:
# NOTE(review): [:0] is always the empty string; [1:] was presumably intended. The [-1] lookups below fail on an empty string.
list_unindexed[i][j] = list_unindexed[i][j][:0]
if list_unindexed[i][j][-1] in punctuation:
list_unindexed[i][j] = list_unindexed[i][j][:-1]
if list_unindexed[i][j][-1] == "s":
list_unindexed[i][j] = list_unindexed[i][j][:-1]
if list_unindexed[i][j][-2:] in endings:
list_unindexed[i][j] = list_unindexed[i][j][:-2]
if list_unindexed[i][j][-3:] == "ing":
list_unindexed[i][j] = list_unindexed[i][j][:-3]
if list_unindexed[i][j] in stopWords:
# After a deletion the next word moves into position j, so j is advanced only in the else branch.
del list_unindexed[i][j]
else:
j += 1
i += 1
countline += 1
def new_line(n):
# NOTE: the result of this split is discarded.
split(n,"\n")
count = 1
if n[-1] == "\n":
count += 1
return count
# The nested list is turned into its printed representation and de-duplicated token by token.
string1 = str(list_unindexed)
string2 = str(string1)
string2 ='\n'.join(unique_string(string2.split()))
print string2
Is it your homework?
Here some tips:
Don't do: from string import *. You don't need it.
Use data.splitlines() to get list of lines
Use enumerate() to get a index, e.g.: for i, line in enumerate(data.splitlines())
Use a dictionary for keeping track of all words. Each value could be a list or a set of line numbers
Don't remove duplicates initially. You can do this using dictionaries or sets.