Below is my example code:
from fuzzywuzzy import fuzz
import json
from itertools import zip_longest

synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())

vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11th Generation)",
               "hard drive", "ddr 8gb", "something1", "something2",
               "something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]

vendor = []
buyer = []

for item, value in synonyms.items():
    for k, k2 in zip_longest(vendor_data, buyer_data):
        for v in value:
            if fuzz.token_set_ratio(k, v) > 70:
                if item in k:
                    vendor.append(k)
                else:
                    vendor.append(item + " " + k)
            else:
                # didn't get only "something" strings here!
                if fuzz.token_set_ratio(k2, v) > 70:
                    if item in k2:
                        buyer.append(k2)
                    else:
                        buyer.append(item + " " + k2)

vendor = list(set(vendor))
buyer = list(set(buyer))
vendor, buyer
Note: "something" string can be anything like "battery" or "display"etc
synonyms.json:
{
    "processor": ["corei5", "core", "corei7", "i5", "i7", "ryzen5", "i5 processor", "i7 processor", "processor i5", "processor i7", "core generation", "core gen"],
    "ram": ["DDR4", "memory", "DDR3", "DDR", "DDR 8gb", "DDR 8 gb", "DDR 16gb", "DDR 16 gb", "DDR 32gb", "DDR 32 gb", "DDR4-"],
    "ssd": ["solid state drive", "solid drive"],
    "hdd": ["Hard Drive"]
}
What do I need?
I want to add all the "something" strings to the vendor list dynamically.
NOTE: the "something" strings can be anything in the future.
I want to add every "something" string to the vendor list that is not matched by fuzz > 70; basically, I also want to keep the left-over data.
For example, see below.
Current output:
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state']
Expected output:
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state',
'something1',
'something2',
'something3']  # the "something" strings need to be added to the vendor list dynamically
What silly mistake am I making? Thank you.
Here's my attempt:
from fuzzywuzzy import process, fuzz

synonyms = {'processor': ['corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5', 'i5 processor', 'i7 processor', 'processor i5', 'processor i7', 'core generation', 'core gen'], 'ram': ['DDR4', 'memory', 'DDR3', 'DDR', 'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb', 'DDR 32gb', 'DDR 32 gb', 'DDR4-'], 'ssd': ['solid state drive', 'solid drive'], 'hdd': ['Hard Drive']}
vendor_data = ['i7 processor', 'solid state', 'Corei5 :1135G7 (11th Generation)', 'hard drive', 'ddr 8gb', 'something1', 'something2', 'something3', 'HT (100W) DDR4-2400']
buyer_data = ['i7 processor 12 generation', 'corei7:latest technology']

def find_synonym(s: str, min_score: int = 60):
    results = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
    if not results:
        return None
    return results[0][-1]

def process_data(l: list, min_score: int = 60):
    matches = []
    no_matches = []
    for item in l:
        syn = find_synonym(item, min_score=min_score)
        if syn is not None:
            new_item = f'{syn} {item}' if syn not in item else item
            matches.append(new_item)
        elif any(fuzz.partial_ratio(s, item) >= min_score for s in synonyms.keys()):
            # one of the synonyms is already in the item string
            matches.append(item)
        else:
            no_matches.append(item)
    return matches, no_matches
For process_data(vendor_data) we get:
(['i7 processor',
'ssd solid state',
'processor Corei5 :1135G7 (11th Generation)',
'hdd hard drive',
'ram ddr 8gb',
'ram HT (100W) DDR4-2400'],
['something1', 'something2', 'something3'])
And for process_data(buyer_data):
(['i7 processor 12 generation', 'processor corei7:latest technology'], [])
I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns two lists: one with items that matched words from the synonyms dict and one with items that had no match. If you want exactly the output you listed in your question, just concatenate the two lists like this:
matches, no_matches = process_data(vendor_data)
matches + no_matches # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']
I have tried to come up with a decent answer (certainly not the cleanest one):
import json
from itertools import zip_longest
from fuzzywuzzy import fuzz

synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())

vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11thGeneration)", "hard drive", "ddr 8gb",
               "something1", "something2", "something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]

vendor = []
buyer = []

for k, k2 in zip_longest(vendor_data, buyer_data):
    has_matched = False
    for item, value in synonyms.items():
        for v in value:
            if fuzz.token_set_ratio(k, v) > 70:
                if item in k:
                    vendor.append(k)
                else:
                    vendor.append(item + " " + k)
                if has_matched or k2 is None:
                    break
                else:
                    has_matched = True
            if fuzz.token_set_ratio(k2, v) > 70:
                if item in k2:
                    buyer.append(k2)
                else:
                    buyer.append(item + " " + k2)
                if has_matched or k is None:
                    break
                else:
                    has_matched = True
        else:
            continue  # match not found
        break  # match is found
    else:  # only evaluates on normal loop end
        # Only something strings
        # do something with the new input values
        continue

vendor = list(set(vendor))
buyer = list(set(buyer))
I hope you can achieve what you want with this code. Check the docs if you don't know what a for-else loop does. TL;DR: the else clause executes when the loop terminates normally (i.e. without a break). Note that I put the synonyms loop inside the data loop. This is because we can't know for certain which synonym group a data entry belongs to, and sometimes the vendor entry is a processor while the buyer entry is memory. Also note that I have assumed an item can't match more than once. If that could be the case, you would need a more advanced check (for example, keep a counter and break when it reaches 2).
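If the for-else construct is new to you, here is a tiny standalone sketch (needle and haystack are just placeholder names) showing that the else branch only runs when the loop finishes without hitting a break:

def find(needle, haystack):
    for item in haystack:
        if item == needle:
            print("found", needle)
            break
    else:
        # only reached when the loop ran to completion without a break
        print(needle, "not found")

find("ram", ["cpu", "ram", "ssd"])   # found ram
find("gpu", ["cpu", "ram", "ssd"])   # gpu not found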
EDIT:
I took another look at the question and came up with maybe a better answer:
v_dict = dict()
for spec in vendor_data[:]:
    for item, choices in synonyms.items():
        if process.extractOne(spec, choices)[1] > 70:  # don't forget to import process from fuzzywuzzy
            v_dict[spec] = item
            break
    else:
        v_dict[spec] = "Something new"
This code matches the strings to the correct type, for example: {'i7 processor': 'processor', 'solid state': 'ssd', 'Corei5 :1135G7 (11thGeneration)': 'processor', 'hard drive': 'ssd', 'ddr 8gb': 'ram', 'something1': 'Something new', 'something2': 'Something new', 'something3': 'Something new', 'HT (100W) DDR4-2400': 'ram'}. You can replace "Something new" with whatever you like. You could also do v_dict[spec] = 0 (on a match) and v_dict[spec] = 1 (on no match). You could then sort the dict:
it = iter(v_dict.values())
print(sorted(v_dict.keys(), key=lambda x: next(it)))
This gives the wanted results (more or less): all the recognised items come first, followed by all the unrecognised items. You could do some more advanced sorting on this dict if you want. I think this code gives you enough flexibility to reach your goal.
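For completeness, here is a small sketch of the 0/1 variant mentioned above (it reuses vendor_data, synonyms and the fuzzywuzzy process module from earlier, with the same 70 threshold); unmatched entries sort last, which mirrors the expected output from the question:

v_dict = dict()
for spec in vendor_data:
    for item, choices in synonyms.items():
        if process.extractOne(spec, choices)[1] > 70:
            v_dict[spec] = 0   # recognised spec
            break
    else:
        v_dict[spec] = 1       # unrecognised ("something" strings)

# recognised specs come first, unrecognised ones at the end
print(sorted(v_dict, key=v_dict.get))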
If I understand correctly, what you are trying to do is match keywords specified by a customer and/or vendor against a predefined database of keywords you have.
First, I would highly recommend using a reversed mapping of the synonyms, so lookups are faster, especially as the dataset grows.
Second, considering the fuzzywuzzy API, it looks like you simply want the best match, so extractOne is a solid choice for that.
Now, extractOne returns the best match and a score:
>>> process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
I would split the algorithm into two:
A generic part that simply gets the best match, which should always exist (even if it's not a great one)
A filter, where you could adjust the sensitivity of the algorithm, based on different criteria of your application. This sensitivity threshold should set the minimal match quality. If you're below this threshold, just use "untagged" for the category for example.
Here is the final code, which I think is very simple and easy to understand and expand:
import json
from fuzzywuzzy import process


def load_synonyms():
    with open('synonyms.json') as fin:
        synonyms = json.load(fin)
    # Reversing the map makes it much easier to lookup
    reversed_synonyms = {}
    for key, values in synonyms.items():
        for value in values:
            reversed_synonyms[value] = key
    return reversed_synonyms


def load_vendor_data():
    return [
        "i7 processor",
        "solid state",
        "Corei5 :1135G7 (11thGeneration)",
        "hard drive",
        "ddr 8gb",
        "something1",
        "something2",
        "something3",
        "HT (100W) DDR4-2400"
    ]


def load_customer_data():
    return [
        "i7 processor 12 generation",
        "corei7:latest technology"
    ]


def get_tag(keyword, synonyms):
    THRESHOLD = 80
    DEFAULT = 'general'
    tag, score = process.extractOne(keyword, synonyms.keys())
    return synonyms[tag] if score > THRESHOLD else DEFAULT


def main():
    synonyms = load_synonyms()
    customer_data = load_customer_data()
    vendor_data = load_vendor_data()

    data = customer_data + vendor_data
    tags_dict = {keyword: get_tag(keyword, synonyms) for keyword in data}

    print(json.dumps(tags_dict, indent=4))


if __name__ == '__main__':
    main()
When running with the specified inputs, the output is:
{
    "i7 processor 12 generation": "processor",
    "corei7:latest technology": "processor",
    "i7 processor": "processor",
    "solid state": "ssd",
    "Corei5 :1135G7 (11thGeneration)": "processor",
    "hard drive": "hdd",
    "ddr 8gb": "ram",
    "something1": "general",
    "something2": "general",
    "something3": "general",
    "HT (100W) DDR4-2400": "ram"
}
I have a PDF file which contains lottery ticket winners; I want to extract all winning tickets according to their prizes.
PDF file
I tried this:
import re
import pdfplumber

prize_re = re.compile(r"^\d[a-z]")
cons_prize_re = re.compile(r"^Cons")
ticket1_line_re = re.compile(r"^\d[)]")
ticket2_line_re = re.compile(r"^\d{4}")
ticket3_line_re = re.compile(r"[A-Z] \d{6}")

with pdfplumber.open("./test11.pdf") as pdf:
    for i in range(len(pdf.pages)):
        page_text = pdf.pages[i].extract_text()
        for line in page_text.split("\n"):
            if prize_re.match(line) or cons_prize_re.match(line) or ticket1_line_re.match(line) or ticket2_line_re.match(line) or ticket3_line_re.search(line):
                print(line)
And I got this. I don't know how to assign each ticket to its prize. Also, the Cons prize ticket numbers look a little bit strange and I don't know why (AN 867952AO 867952AP should be => AN 867952 AO 867952 AP ...):
1st Prize Rs :7000000/- 1) AU 867952 (MANANTHAVADY)
Cons Prize-Rs :8000/- AN 867952AO 867952AP 867952 AR 867952AS 867952
AT 867952 AV 867952 AW 867952AX 867952AY 867952
AZ 867952
2nd Prize Rs :500000/- 1) AZ 499603 (ADOOR)
3rd Prize Rs :100000/- 1) AN 215264 (KOTTAYAM)
2) AO 852774 (PATTAMBI)
3) AP 953655 (KOTTAYAM)
4) AR 638904 (PAYYANUR)
5) AS 496774 (VAIKKOM)
6) AT 878990 (WAYANADU)
7) AU 703702 (PUNALUR)
8) AV 418446 (WAYANADU)
9) AW 994685 (KOZHIKKODE)
10) AX 317550 (PATTAMBI)
11) AY 854780 (CHITTUR)
12) AZ 899905 (KARUNAGAPALLY
...
Instead I want to get:
[
{
"1st Prize Rs :7000000",
"tickets": [
"AU 867952"
]
},
{
"Cons Prize-Rs :8000",
"tickets": [
"AN 867952",
"AO 867952",
"AP 867952",
"AR 867952",
...
]
},
...
]
How can I achieve this?
You could first get all the full prize sections from all the pages using capture groups.
Then you can post-process the 3rd capture group to get the separate "tickets" and build the wanted data structure in a loop.
For the prize sections, you can use a pattern that matches the start of every prize section and captures all values up to the next prize section.
^(\w+ Prize[-\s]Rs\s*):(\d+)/-(?:\s*\d+\))?\s*(.*(?:\n(?!\w+ Prize\b).*)*)
Regex demo
For the post-processing, you can use a pattern for the ticket formats, which matches either 2 uppercase chars, a space and 6 digits, or 4 or more digits surrounded by whitespace boundaries.
(?:[A-Z]{2} \d{6}(?!\d)|(?<!\S)\d{4,}(?!\S))
Regex demo
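As a quick sanity check that this pattern also splits the fused ticket numbers from the "Cons" line in the question, you can run it through re.findall (only the standard library re module is needed):

import re

line = "AN 867952AO 867952AP 867952 AR 867952AS 867952"
print(re.findall(r"(?:[A-Z]{2} \d{6}(?!\d)|(?<!\S)\d{4,}(?!\S))", line))
# ['AN 867952', 'AO 867952', 'AP 867952', 'AR 867952', 'AS 867952']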
Example code using the pdf file from the question:
import re
import pdfplumber
import json

pattern = r"^(\w+ Prize[-\s]Rs\s*):(\d+)/-(?:\s*\d+\))?\s*(.*(?:\n(?!\w+ Prize\b).*)*)"

with pdfplumber.open("./test11.pdf") as pdf:
    all_text = ""
    for page in pdf.pages:
        all_text += '\n' + page.extract_text()

matches = re.finditer(pattern, all_text, re.MULTILINE)

coll = []
for matchNum, match in enumerate(matches):
    dct = {}
    dct[match.group(1)] = match.group(2)
    dct["tickets"] = re.findall(r"(?:[A-Z]{2} \d{6}(?!\d)|(?<!\S)\d{4,}(?!\S))", match.group(3))
    coll.append(dct)

print(json.dumps(coll, indent=4))
Output
[
    {
        "1st Prize Rs ": "120000000",
        "tickets": [
            "XG 218582"
        ]
    },
    {
        "Cons Prize-Rs ": "500000",
        "tickets": [
            "XA 218582",
            "XB 218582",
            "XC 218582",
            "XD 218582",
            "XE 218582"
        ]
    },
    {
        "2nd Prize Rs ": "5000000",
        "tickets": [
            "XA 788417",
            "XB 161796",
            "XC 319503",
            "XD 713832",
            "XE 667708",
            "XG 137764"
        ]
    },
    ....
I have looked through different articles here about searching data from a list, but nothing seems to work right or be appropriate for what I am supposed to implement.
I have this pre-created module with over 500 records of names, city, email, etc. (they are strings, yes, but are treated as a list when passed into the function; see code below). The following is just a chunk of it.
empRecords="""Jovita,Oles,8 S Haven St,Daytona Beach,Volusia,FL,6/14/1965,32114,386-248-4118,386-208-6976,joles#gmail.com,http://www.paganophilipgesq.com,;
Alesia,Hixenbaugh,9 Front St,Washington,District of Columbia,DC,3/3/2000,20001,202-646-7516,202-276-6826,alesia_hixenbaugh#hixenbaugh.org,http://www.kwikprint.com,;
Lai,Harabedian,1933 Packer Ave #2,Novato,Marin,CA,1/5/2000,94945,415-423-3294,415-926-6089,lai#gmail.com,http://www.buergimaddenscale.com,;
Brittni,Gillaspie,67 Rv Cent,Boise,Ada,ID,11/28/1974,83709,208-709-1235,208-206-9848,bgillaspie#gillaspie.com,http://www.innerlabel.com,;
Raylene,Kampa,2 Sw Nyberg Rd,Elkhart,Elkhart,IN,12/19/2001,46514,574-499-1454,574-330-1884,rkampa#kampa.org,http://www.hermarinc.com,;
Flo,Bookamer,89992 E 15th St,Alliance,Box Butte,NE,12/19/1957,69301,308-726-2182,308-250-6987,flo.bookamer#cox.net,http://www.simontonhoweschneiderpc.com,;
Jani,Biddy,61556 W 20th Ave,Seattle,King,WA,8/7/1966,98104,206-711-6498,206-395-6284,jbiddy#yahoo.com,http://www.warehouseofficepaperprod.com,;
Chauncey,Motley,63 E Aurora Dr,Orlando,Orange,FL,3/1/2000,32804,407-413-4842,407-557-8857,chauncey_motley#aol.com,http://www.affiliatedwithtravelodge.com
"""
a = empRecords.strip().split(";")
And I have the following code for searching:
import empData as x

def seecity():
    empCitylist = list()
    for ct in x.a:
        empCt = ct.strip().split(",")
        empCitylist.append(empCt)
    t = sorted(empCitylist, key=lambda x: x[3])
    for c in t:
        city = (c[3])
        print(city)
    live_city = input("Enter city: ")
    for cy in city:
        if live_city in cy:
            print(c[1])
            # print("Name: "+ c[1] + ",", c[0], "| Current City: " + c[3])
Forgive my idiotic approach, as I am new to Python. What I am trying to do is this: the user inputs a city, and the results should display the last name and first name of the employees who live in that city (I dunno if I made sense lol).
By the way, the code I used above doesn't return any results. It just loops back to the input.
Thank you for helping. Lovelots. <3
PS: the format of the empData records is: first name, last name, address, city, county, state, birthday, zip, phone numbers, email, and website
You can use the csv module to easily read a file with comma-separated values:
import csv

with open('test.csv', newline='') as csvfile:
    records = list(csv.reader(csvfile))

def search(data, elem, index):
    out = list()
    for row in data:
        if row[index] == elem:
            out.append(row)
    return out

#test
print(search(records, 'Orlando', 3))
Based on your original code, you can do it like this:
# Make list of list records, sorted by city
t = sorted((ct.strip().split(",") for ct in x.a), key=lambda x: x[3])

# List cities
print("Cities in DB:")
for c in t:
    city = (c[3])
    print("-", city)

# Define search function
def seecity():
    live_city = input("Enter city: ")
    for c in t:
        if live_city == c[3]:
            print("Name: "+ c[1] + ",", c[0], "| Current City: " + c[3])

seecity()
Then, after you understand what's going on, do as @Hoxha Alban suggested and use the csv module.
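Here is a minimal sketch of that combination, assuming the records still live in the empData module from your question rather than in a .csv file (io.StringIO turns the joined string back into a file-like object for csv.reader):

import csv
import io
import empData as x

# x.a already holds the ";"-separated records as individual strings
records = list(csv.reader(io.StringIO("\n".join(r.strip() for r in x.a))))

def seecity():
    live_city = input("Enter city: ")
    for r in records:
        if r[3] == live_city:
            print("Name: " + r[1] + ", " + r[0] + " | Current City: " + r[3])

seecity()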
The beauty of Python lies in list comprehensions.
empRecords="""Jovita,Oles,8 S Haven St,Daytona Beach,Volusia,FL,6/14/1965,32114,386-248-4118,386-208-6976,joles#gmail.com,http://www.paganophilipgesq.com,;
Alesia,Hixenbaugh,9 Front St,Washington,District of Columbia,DC,3/3/2000,20001,202-646-7516,202-276-6826,alesia_hixenbaugh#hixenbaugh.org,http://www.kwikprint.com,;
Lai,Harabedian,1933 Packer Ave #2,Novato,Marin,CA,1/5/2000,94945,415-423-3294,415-926-6089,lai#gmail.com,http://www.buergimaddenscale.com,;
Brittni,Gillaspie,67 Rv Cent,Boise,Ada,ID,11/28/1974,83709,208-709-1235,208-206-9848,bgillaspie#gillaspie.com,http://www.innerlabel.com,;
Raylene,Kampa,2 Sw Nyberg Rd,Elkhart,Elkhart,IN,12/19/2001,46514,574-499-1454,574-330-1884,rkampa#kampa.org,http://www.hermarinc.com,;
Flo,Bookamer,89992 E 15th St,Alliance,Box Butte,NE,12/19/1957,69301,308-726-2182,308-250-6987,flo.bookamer#cox.net,http://www.simontonhoweschneiderpc.com,;
Jani,Biddy,61556 W 20th Ave,Seattle,King,WA,8/7/1966,98104,206-711-6498,206-395-6284,jbiddy#yahoo.com,http://www.warehouseofficepaperprod.com,;
Chauncey,Motley,63 E Aurora Dr,Orlando,Orange,FL,3/1/2000,32804,407-413-4842,407-557-8857,chauncey_motley#aol.com,http://www.affiliatedwithtravelodge.com
"""
rows = empRecords.strip().split(";")
data = [ r.strip().split(",") for r in rows ]
Then you can use any condition to filter the list, like:
print ( [ "Name: " + emp[1] + "," + emp[0] + "| Current City: " + emp[3] for emp in data if emp[3] == "Washington" ] )
['Name: Hixenbaugh,Alesia| Current City: Washington']
dic = {
'key_1':['val_1','val_2'],
'key_2':['val_3','val_4'],
'key_3':['val_5','val_6']
}
info = {
'i_1':'good',
'i_2':'bad'
}
for k,v in dic.items()
print 'Jack scrd'+info[i_2]+"in both subjects"+dic[val1]+'&'+dic[val2]
I know the print code is not right, but I have included it here to show what I really want to do. I just want to print a line similar to the one above.
The following does this. The format method allows you to easily substitute {} placeholders in a string with the variables of your choice. As for the dictionary, dic[k] gives you the whole list, of which ['val_1','val_2'] is one. So you would use dic[k][0] to get the first value and dic[k][1] to get the second value.
for k,v in dic.items():
    msg = 'Jack scrd {} in both subjects {} & {}'
    print msg.format(info['i_2'], dic[k][0], dic[k][1])
# Jack scrd bad in both subjects val_1 & val_2
# Jack scrd bad in both subjects val_5 & val_6
# Jack scrd bad in both subjects val_3 & val_4
Perhaps you want to do this? Remove the for loop:
print 'Jack scrd'+info['i_2']+'in both subjects'+str(dic['key_1'][0])+"&"+str(dic['key_1'][1])
You can use .join to join all list items with a given separator. If you want to print all keys of dic, you can use:
dic = {
    'key_1':['val_1','val_2'],
    'key_2':['val_3','val_4'],
    'key_3':['val_5','val_6']
}

info = {
    'i_1':'good',
    'i_2':'bad'
}

for keys in dic:
    # .join will join all list items for a particular key with & and store it in variable subjects
    subjects = ' & '.join(dic[keys])
    print "jack scored " + info['i_1'] + " in both subjects " + subjects
output:
jack scored good in both subjects val_1 & val_2
jack scored good in both subjects val_5 & val_6
jack scored good in both subjects val_3 & val_4
place_iraq = {
'shrine_1' : ['karbala','imam hussein as.'],
'shrine_2' : ['najaf', 'imam ali as.'],
'yard' : ['karbala', 'wadi-us-salam'],
'shrine_3' : ['karbala', 'abbas as.']
}
type = {
't1':'shrine',
't2': 'grave yard'
}
print 'The '+str(type['t1'])+' of '+str(place_iraq['shrine_1'][1])+' is situated in ' +\
str(place_iraq['shrine_1'][0])
print 'The '+str(type['t1'])+' of '+str(place_iraq['shrine_2'][1])+' is situated in ' +\
str(place_iraq['shrine_2'][0])
print 'The '+str(type['t1'])+' of '+str(place_iraq['shrine_3'][1])+' is situated in ' +\
str(place_iraq['shrine_3'][0])
print 'The '+str(type['t2'])+' of '+str(place_iraq['yard'][1])+' is situated in ' +\
str(place_iraq['yard'][0])
I need to create a program which removes punctuation and some specific words, removes duplicates, and returns the remaining words together with their respective line numbers. I also need to keep track of the duplicates. For instance:
Python IDLE
Indexer: type in lines, finish with a . at start of line only
It is a briskly blowing wind that blows
from the north, the North of my youth.
The wind is cold too, colder than the
winds of yesteryear.
.
The index is:
brisk 1
blow 1
wind 1, 3, 4
north 2
youth 2
cold 3
yesteryear 4
The problem: I need to keep track of the line numbers of the remaining words and also of their duplicates. I'm not able to do that.
from string import *

stopWords = [ "a", "i", "it", "am", "at", "on", "in", "to", "too", "very", \
              "of", "from", "here", "even", "the", "but", "and", "is", "my", \
              "them", "then", "this", "that", "than", "though", "so", "are" ]

endings = [ "es" , "ed" , "er", "ly"]

punctuation = [ ".", "," , ":" , ";" , "!" , "?" , "&" , "'" ]

unindexed_sentence = raw_input("type in lines, finish with a . at start of line only").lower()

#removing duplicates.
def unique_string(l):
    ulist = []
    ulist2 = []
    [ulist.append(x) for x in l if x not in ulist]
    [ulist2.append(x)]
    global ulist2
    return ulist

unindexed_sentence = ' '.join(unique_string(unindexed_sentence.split()))

unindexed_sentence1 = split(unindexed_sentence, "\n")

list_unindexed = []

# splitting
i = 0
while i < len(unindexed_sentence1):
    list_unindexed += [split(unindexed_sentence1[i])]
    i += 1

countline = 0
i = 0
while i < len(list_unindexed):
    j = 0
    while j < len(list_unindexed[i]):
        if list_unindexed[i][j][0] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:0]
        if list_unindexed[i][j][-1] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-1] == "s":
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-2:] in endings:
            list_unindexed[i][j] = list_unindexed[i][j][:-2]
        if list_unindexed[i][j][-3:] == "ing":
            list_unindexed[i][j] = list_unindexed[i][j][:-3]
        if list_unindexed[i][j] in stopWords:
            del list_unindexed[i][j]
        else:
            j += 1
    i += 1
    countline += 1

def new_line(n):
    split(n, "\n")
    count = 1
    if n[-1] == "\n":
        count += 1
    return count

string1 = str(list_unindexed)
string2 = str(string1)
string2 = '\n'.join(unique_string(string2.split()))

print string2
Is this your homework?
Here are some tips:
Don't do: from string import *. You don't need it.
Use data.splitlines() to get list of lines
Use enumerate() to get a index, e.g.: for i, line in enumerate(data.splitlines())
Use a dictionary for keeping track of all words. Each value could be a list or a set of line numbers
Don't remove duplicates initially. You can handle this with dictionaries or sets (see the sketch below).
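Putting those tips together, here is a minimal sketch (Python 3 syntax; build_index and normalize are made-up names, and the suffix stripping is the same crude approach as in your code):

stopWords = {"a", "i", "it", "am", "at", "on", "in", "to", "too", "very",
             "of", "from", "here", "even", "the", "but", "and", "is", "my",
             "them", "then", "this", "that", "than", "though", "so", "are"}

def normalize(word):
    # strip surrounding punctuation and the crude suffixes from the question
    word = word.strip(".,:;!?&'").lower()
    for suffix in ("ing", "es", "ed", "er", "ly", "s"):
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            word = word[:-len(suffix)]
            break
    return word

def build_index(text):
    index = {}  # word -> set of line numbers
    for lineno, line in enumerate(text.splitlines(), start=1):
        for raw in line.split():
            word = normalize(raw)
            if word and word not in stopWords:
                index.setdefault(word, set()).add(lineno)
    return index

text = ("It is a briskly blowing wind that blows\n"
        "from the north, the North of my youth.")
for word, lines in sorted(build_index(text).items()):
    print(word, ", ".join(str(n) for n in sorted(lines)))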