How to match value in enumeration to a keyword? - python

I want to write a keyword-in-context script in which I first read a text file as an enumerated list and then return a given keyword and the five next words.
I saw that similar questions were asked for C# and I found solutions for the enum module in Python, but I hope there is a solution for just using the enumerate() function.
This is what I have got so far:
# Find keywords in context
import string
# open input txt file from local path
with open('C:\\Users\\somefile.txt', 'r', encoding='utf-8', errors='ignore') as f: # open file
data1=f.read() # read content of file as string
data2=data1.translate(str.maketrans('', '', string.punctuation)).lower() # remove punctuation
data3=" ".join(data2.split()) # remove additional whitespace from text
indata=list(data3.split()) # convert string to list
print(indata[:4])
searchterms=["text", "book", "history"]
def wordsafter(keyword, source):
for i, val in enumerate(source):
if val == keyword: # cannot access the enumeration value here
return str(source[i+5]) # intend to show searchterm and subsequent five words
else:
continue
for s in searchterms: # iterate through searchterms
print(s)
wordsafter(s, indata)
print("done")
I was hoping I could simply access the value of the enumeration like I did here, but that does not seem to be the case.

With credits to #jasonharper, your improved code:
import string
def wordsafter(keyword, source):
    """Return the first occurrence of *keyword* together with the five words after it.

    Args:
        keyword: The token to look for (compared with ``==``).
        source: A list of tokens to search.

    Returns:
        A space-joined string of the keyword and up to five following tokens,
        or ``None`` if the keyword does not occur in *source*.
    """
    for i, val in enumerate(source):
        if val == keyword:
            # i + 6 because the slice holds the keyword itself plus five
            # followers; slicing clamps at the end of the list.
            return ' '.join(source[i:i + 6])
    return None
# wordsafter() for all instances
def wordsafter(keyword, source):
    """Return every occurrence of *keyword* with the five words that follow it.

    Args:
        keyword: The token to look for (compared with ``==``).
        source: A list of tokens to search.

    Returns:
        A list with one space-joined string per occurrence, each holding the
        keyword and up to five following tokens. Empty if there is no match.
    """
    # i + 6: the keyword itself plus five followers; slicing clamps at the end.
    return [
        ' '.join(source[i:i + 6])
        for i, val in enumerate(source)
        if val == keyword
    ]
# open input txt file from local path
with open('README.md', 'r', encoding='utf-8', errors='ignore') as f:  # open file
    data1 = f.read()  # read content of file as string
# remove punctuation and lower-case the text
data2 = data1.translate(str.maketrans('', '', string.punctuation)).lower()
data3 = " ".join(data2.split())  # remove additional whitespace from text
indata = data3.split()  # split() already returns a list of words
searchterms = ["this", "book", "history"]
# The loop variable is called `term`, not `string`, so that it does not
# shadow the imported `string` module.
for term in searchterms:  # iterate through searchterms
    result = wordsafter(term, indata)
    if result:
        print(result)

Related

How to read a value of file separate by tabs in Python?

I have a text file with this format
ConfigFile 1.1
;
; Version: 4.0.32.1
; Date="2021/04/08" Time="11:54:46" UTC="8"
;
Name
John Legend
Type
Student
Number
s1054520
I would like to get the value of Name or Type or Number
How do I get it?
I tried with this method, but it does not solve my problem.
import re
f = open("Data.txt", "r")
file = f.read()
Name = re.findall("Name", file)
print(Name)
My expectation output is John Legend
Anyone can help me please. I really appreciated. Thank you
First of all, re.findall searches for all occurrences that match a given pattern, so in your case it returns every "Name" in the file, because that is exactly the pattern you asked for.
On the other hand, the computer does not know that "John Legend" is the name; it only knows that it is the line after the word "Name".
In your case I will suggest you can check this link.
Find the "Name"'s line number
Read the next line
Get the name without the white space
If there is more than one Name, this will work as well.
the final code is like this
def search_string_in_file(file_name, string_to_search):
    """Find every line of a file that contains a given substring.

    Args:
        file_name: Path of the text file to scan.
        string_to_search: Substring to look for in each line.

    Returns:
        A list of ``(line_number, line)`` tuples, where ``line_number`` is
        1-based and ``line`` has its trailing whitespace removed.
    """
    # Open the file in read-only mode and number its lines starting at 1.
    with open(file_name, 'r') as handle:
        return [
            (number, text.rstrip())
            for number, text in enumerate(handle, start=1)
            if string_to_search in text
        ]
# Read all lines once so the line following each match can be looked up by index.
with open('Data.txt') as file:
    content = file.readlines()
matched_lines = search_string_in_file('Data.txt', 'Name')
print('Total Matched lines : ', len(matched_lines))
for line_number, _ in matched_lines:
    # line_number is 1-based, so used as a 0-based index it addresses the line
    # AFTER the match. Skip a match on the last line, which has no next line.
    if line_number < len(content):
        print(content[line_number].strip())
Here I'm going through each line and when I encounter Name I will add the next line (you can directly print too) to the result list:
import re
def print_hi(name):
    """Print the list of lines that directly follow a ``Name`` label in ``test.txt``.

    Args:
        name: Unused; kept so existing callers keep working.
    """
    result = []
    # 'Name*' would also accept "Nam" because * makes the final "e" optional;
    # match() already anchors at the start of the line, so 'Name' is enough.
    regexp = re.compile(r'Name')
    gotname = False
    with open('test.txt') as f:
        for line in f:
            if gotname:
                # The previous line was a label, so this line is its value.
                result.append(line.strip())
                gotname = False
            if regexp.match(line):
                gotname = True
    print(result)
if __name__ == '__main__':
print_hi('test')
Assuming those label lines are in the sequence found in the file you
can simply scan for them:
labelList = ["Name","Type","Number"]
captures = dict()
with open("Data.txt","rt") as f:
    for label in labelList:
        # Advance to the line that starts with the label. readline() returns ''
        # at end of file, so stop there instead of looping forever when a
        # label is missing.
        line = f.readline()
        while line and not line.startswith(label):
            line = f.readline()
        if not line:
            break
        # The value is the line directly after the label.
        captures[label] = f.readline().strip()
# Only report the labels that were actually found.
for label, value in captures.items():
    print(f"{label} : {value}")
I wouldn't use a regex, but rather make a parser for the file type. The rules might be:
The first line can be ignored
Any lines that start with ; can be ignored.
Every line with no leading whitespace is a key
Every line with leading whitespace is a value belonging to the last
key
I'd start with a generator that can return to you any unignored line:
def read_data_lines(filename):
    """Yield the meaningful lines of a data file.

    The first line is always dropped, and so is every line that starts with
    ``;``. Yielded lines keep their original whitespace and line ending.

    Args:
        filename: Path of the file to read.
    """
    with open(filename, "r") as f:
        # Discard the first line; the default avoids an error on an empty file.
        next(f, None)
        for line in f:
            if line.startswith(";"):
                continue
            yield line
Then fill up a dict by following rules 3 and 4:
def parse_data_file(filename):
    """Parse a key/value data file into a dict.

    Lines come from ``read_data_lines``. A line with no leading whitespace is
    a key; a line with leading whitespace (space or tab) is the value of the
    most recent key.

    Args:
        filename: Path of the file to parse.

    Returns:
        A dict mapping each key to its stripped value. If a key has several
        value lines, the last one wins.
    """
    data = {}
    key = None
    for line in read_data_lines(filename):
        # Blank lines are neither keys nor values.
        if not line.strip():
            continue
        # Any leading whitespace marks a value; startswith(" ") alone would
        # miss tab-indented values.
        if line[0].isspace():
            # Ignore a value that appears before any key.
            if key is not None:
                data[key] = line.strip()
        else:
            key = line.strip()
    return data
Now at this point you can parse the file and print whatever key you want:
data = parse_data_file("Data.txt")
print(data["Name"])

Problem with using a text file to create a dictionary that has word length as its key and the actual word itself as the value in Python

I'm currently a beginner in Python taking an introductory course for Python and I'm having trouble with creating a hangman game in which we derive our words from a text file that has each word printed on a new line, where we then in a function choose a word at random based on the word length indicated by the user. I'm not sure how we are supposed to do that, I've uploaded my current code, the problem is when I print out the dictionary only the words from the text file actually get printed out, I'm not sure why the dictionary with the keys and values aren't getting printed out... I'm also not sure why my professor wants us to use a try and except in this function and how I'm supposed to use max_size.
Here's what I've currently done
def import_dictionary (dictionary_file):
dictionary = {}
max_size = 12
with open ('dictionary.txt', 'a+') as dictionary:
dictionary_file = dictionary.read().split()
for word in dictionary_file:
dictionary[len(word)] = word
return dictionary
The function I'm using to print out
def print_dictionary (dictionary):
max_size = 12
with open('dictionary.txt', 'r') as dictionary:
print(dictionary.read())
Try this .
from collections import defaultdict
import random
def read_text_file():
    """Group the lines of ``file.txt`` by their length.

    Returns:
        A ``defaultdict(list)`` mapping each length to the list of lines
        (split on ``"\\n"``) that have that length, in file order.
    """
    with open("file.txt", "r") as f:
        entries = f.read().split("\n")
    by_length = defaultdict(list)
    for entry in entries:
        by_length[len(entry)].append(entry)
    return by_length
def main():
    """Read a word length from standard input and print a random word of that length."""
    user_length = int(input())
    words = read_text_file()
    # .get() avoids creating an empty entry in the defaultdict for unknown lengths.
    candidates = words.get(user_length)
    if not candidates:
        print(f"No words of length {user_length}")
        return
    # random.sample() requires a sample size argument; random.choice() picks
    # the single random word that is wanted here.
    print(random.choice(candidates))
Try the following:
def import_dictionary(dictionary_file):
    """Load a word list and group the words by length.

    Args:
        dictionary_file: Path of a text file with one word per line.

    Returns:
        A dict mapping word length to the list of words of that length, in
        file order. Blank lines and words longer than 12 characters are
        skipped.
    """
    dictionary = {}
    max_size = 12
    with open(dictionary_file, 'r') as f:
        for line in f:
            word = line.strip()
            # Skip blank lines (including the one produced by a trailing
            # newline) so that no empty "word" is stored under key 0.
            if not word or len(word) > max_size:
                continue
            dictionary.setdefault(len(word), []).append(word)
    return dictionary

Changing list to string to remove characters

I have a file that I am trying to do a word frequency list on, but I'm having trouble with the list and string aspects. I changed my file to a string to remove numbers from the file, but that ends up messing up the tokenization. The expected output is a word count of the file I am opening excluding numbers, but what I get is the following:
Counter({'<_io.TextIOWrapper': 1, "name='german/test/polarity/negative/neg_word_list.txt'": 1, "mode='r'": 1, "encoding='cp'>": 1})
done
Here's the code:
import re
from collections import Counter
def word_freq(file_tokens):
    """Count how often each token occurs.

    Args:
        file_tokens: An iterable of hashable tokens.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    # The module-level `count` is still assigned for backward compatibility.
    global count
    # Build the Counter once: rebuilding it for every token is quadratic and
    # leaves `count` undefined when there are no tokens.
    count = Counter(file_tokens)
    return count
f = open("german/test/polarity/negative/neg_word_list.txt")
clean = re.sub(r'[0-9]', '', str(f))
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()
this ended up working, thank you to Rakesh
import re
from collections import Counter
def word_freq(file_tokens):
    """Count how often each token occurs.

    Args:
        file_tokens: An iterable of hashable tokens.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    # The module-level `count` is still assigned for backward compatibility.
    global count
    # Build the Counter once: rebuilding it for every token is quadratic and
    # leaves `count` undefined when there are no tokens.
    count = Counter(file_tokens)
    return count
# The context manager closes the file even if reading or counting fails.
with open("german/test/polarity/negative/neg_word_list.txt") as f:
    # Strip all digits from the file content before tokenising.
    clean = re.sub(r'[0-9]', '', f.read())
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
Reading further, I noticed that you didn't "read" the file; you only opened it.
if you print only opening the file:
f = open("german/test/polarity/negative/neg_word_list.txt")
print(f)
You'll notice it will tell you what the object is, "io.TextIOWrapper". So you need to read it:
f_path = open("german/test/polarity/negative/neg_word_list.txt")
f = f_path.read()
f_path.close() # don't forget to do this to clear stuff
print(f)
# >>> what's really inside the file
or another way to do this without the "close()":
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
f = r.read()
Doing that gives you the content as one plain string rather than a list, so alternatively you could iterate over the file line by line:
list_of_lines = []
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
# read each line and append to list
for line in r:
list_of_lines.append(line)

Extracting data from a file using regular expressions and storing in a list to be compiled into a dictionary- python

I've been trying to extract both the species name and the sequence from a file, as depicted below, in order to compile a dictionary whose keys are the species names (FOXP2_MOUSE, for example) and whose values are the corresponding amino acid sequences.
Sample fasta file:
>sp|P58463|FOXP2_MOUSE
MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVSTVELL
HLQQQQALQAARQLLLQQQTSGLKSPKSSEKQRPLQVPVSVAMMTPQVIT
PQQMQQILQQQVLSPQQLQALLQQQQAVMLQQQQLQEFYKKQQEQLHLQL
LQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ-HPGKQAKE
QQQQQQQQQ-LAAQQLVFQQQLLQMQQLQQQQHLLSLQRQGLISIPPGQA
ALPVQSLPQAGLSPAEIQQLWKEVTGVHSMEDNGIKHGGLDLTTNNSSST
TSSTTSKASPPITHHSIVNGQSSVLNARRDSSSHEETGASHTLYGHGVCK
>sp|Q8MJ98|FOXP2_PONPY
MMQESVTETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVSTVELL
HLQQQQALQAARQLLLQQQTSGLKSPKSSDKQRPLQVPVSVAMMTPQVIT
PQQMQQILQQQVLSPQQLQALLQQQQAVMLQQQQLQEFYKKQQEQLHLQL
LQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ--HPGKQAKE
QQQQQQQQQ-LAAQQLVFQQQLLQMQQLQQQQHLLSLQRQGLISIPPGQA
ALPVQSLPQAGLSPAEIQQLWKEVTGVHSMEDNGIKHGGLDLTTNNSSST
TSSTTSKASPPITHHSIVNGQSSVLNARRDSSSHEETGASHTLYGHGVCK
I've tried using my code below:
import re
InFileName = "foxp2.fasta"
InFile = open(InFileName, 'r')
Species = []
Sequence = []
reg = re.compile('FOXP2_\w+')
for Line in InFile:
Species += reg.findall(Line)
print Species
reg = re.compile('(^\w+)')
for Line in Infile:
Sequence += reg.findall(Line)
print Sequence
dictionary = dict(zip(Species, Sequence))
InFile.close()
However, my output for my lists are:
[FOX2_MOUSE, FOXP2_PONPY]
[]
Why is my second list empty? Are you not allowed to use re.compile() twice? Any suggestions on how to circumvent my problem?
Thank you,
Christy
If you want to read a file twice, you have to seek back to the beginning.
InFile.seek(0)
You can do it in a single pass, and without regular expressions:
def load_fasta(filename):
    """Read a FASTA file into a dict in a single pass.

    Lines starting with ``;`` are ignored. A line starting with ``>`` opens a
    new record whose name is the third ``|``-separated field of that line.
    All other lines are stripped and concatenated as the record's sequence.

    Args:
        filename: Path of the FASTA file.

    Returns:
        A dict mapping record name to its joined sequence string.
    """
    records = {}
    current_name = ""
    chunks = []
    with open(filename) as inf:
        for raw in inf:
            text = raw.strip()
            if text.startswith(";"):
                # comment line
                continue
            if not text.startswith(">"):
                # sequence data belonging to the current record
                chunks.append(text)
                continue
            # Header line: store the record collected so far, then start anew.
            if current_name and chunks:
                records[current_name] = "".join(chunks)
            current_name = text.split("|")[2]
            chunks = []
    # End of file: store the last record, if there is one.
    if current_name and chunks:
        records[current_name] = "".join(chunks)
    return records
data = load_fasta("foxp2.fasta")
On your given file, this produces data ==
{
'FOXP2_PONPY': 'MMQESVTETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVSTVELLHLQQQQALQAARQLLLQQQTSGLKSPKSSDKQRPLQVPVSVAMMTPQVITPQQMQQILQQQVLSPQQLQALLQQQQAVMLQQQQLQEFYKKQQEQLHLQLLQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ--HPGKQAKEQQQQQQQQQ-LAAQQLVFQQQLLQMQQLQQQQHLLSLQRQGLISIPPGQAALPVQSLPQAGLSPAEIQQLWKEVTGVHSMEDNGIKHGGLDLTTNNSSSTTSSTTSKASPPITHHSIVNGQSSVLNARRDSSSHEETGASHTLYGHGVCK',
'FOXP2_MOUSE': 'MMQESATETISNSSMNQNGMSTLSSQLDAGSRDGRSSGDTSSEVSTVELLHLQQQQALQAARQLLLQQQTSGLKSPKSSEKQRPLQVPVSVAMMTPQVITPQQMQQILQQQVLSPQQLQALLQQQQAVMLQQQQLQEFYKKQQEQLHLQLLQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ-HPGKQAKEQQQQQQQQQ-LAAQQLVFQQQLLQMQQLQQQQHLLSLQRQGLISIPPGQAALPVQSLPQAGLSPAEIQQLWKEVTGVHSMEDNGIKHGGLDLTTNNSSSTTSSTTSKASPPITHHSIVNGQSSVLNARRDSSSHEETGASHTLYGHGVCK'
}
You could also do this in a single pass with a multiline regex:
import re
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
reg = re.compile(r'(FOXP2_\w+)\n(^[\w\n-]+)', re.MULTILINE)
with open("foxp2.fasta", 'r') as file:
    # The second group spans several lines, so drop the embedded newlines to
    # get one contiguous sequence per species.
    data = {
        species: sequence.replace("\n", "")
        for species, sequence in reg.findall(file.read())
    }
The downside is that you have to read the whole file in at once. Whether this is a problem depends on likely file sizes.

String of lists (probably) to a csv file python

Following up my previous question, because I couldn't get a satisfactory answer. Now I have data like this, don't know what it exactly is
["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]
I'd like my final output to be written to a csv file like below. How can I achieve this?
A ,B ,C
a1,a2 ,b1 ,c1
a2,a4 ,b3 ,ct
Assuming that ["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"] is one long string as the original post seems to imply, ie:
"""["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
then the following code should work:
# ORIGINAL STRING
s = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
# Mark the row boundaries with a temporary separator before the brackets go.
s = s.replace("][", "--")
# Drop the brackets and both kinds of quotes, which the CSV does not need.
for unwanted in "[]'\"":
    s = s.replace(unwanted, "")
# One entry per output row.
lines = s.split("--")
# Write the rows to the CSV file.
# mode "w" overwrites an existing file or creates a new one;
# mode "a" would append to an existing file instead.
with open("myFile.csv", mode="w") as textFile:
    textFile.writelines(line + "\n" for line in lines)
An alternative way, again assuming that the data is encoded as one long string:
import ast
# ORIGINAL STRING
s = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
# Insert commas between the bracketed groups so the text parses as a
# sequence of one-element lists.
s2 = s.replace("][", "],[")
s2 = ast.literal_eval(s2)
# Each inner element is itself a quoted, comma-separated literal; parse it too.
s2 = [ast.literal_eval(row[0]) for row in s2]
# Write one comma-joined line per parsed row.
# mode "w" overwrites an existing file or creates a new one;
# mode "a" would append to an existing file instead.
with open("myFile.csv", mode="w") as textFile:
    for fields in s2:
        textFile.write(",".join(fields) + "\n")
Since the given input won't be accepted by any inbuilt data structure, you need to convert it either into a string or a list of lists. Assuming your input as a string in the following. Also, you can modify the formatting as per your requirement.
#!/usr/bin/python
from ast import literal_eval
def csv(li):
    """Write the rows in *li* to ``test.csv`` as padded, comma-separated lines.

    Args:
        li: A list of one-element lists; each element is a string of
            single-quoted fields separated by commas, e.g. ``"'A','B','C'"``.
    """
    # The context manager closes the file even if a row fails to format.
    with open("test.csv", "w") as file_handle:
        for outer in li:
            # Strip the outermost quotes, then split on the quote-comma pair
            # that ends each field.
            temp = outer[0].strip("'").split("',")
            # Build the formatted line (change this as per your requirement):
            # each field is left-aligned in 10 columns and followed by a
            # right-aligned comma.
            value = "".join(
                '{0: <10}'.format(inner.strip("'")) + '{0: >10}'.format(",")
                for inner in temp
            )
            # Remove the trailing comma and padding.
            value = value.strip(", ")
            file_handle.write(value + "\n")
#assuming your input as string
def main():
    """Split the sample string into its bracketed lists and pass them to ``csv``."""
    li_str = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
    li = []
    cursor = 0
    # Parse each "[...]" group into a list and collect it in li.
    while True:
        start_pos = li_str.find("[", cursor)
        if start_pos == -1:
            break
        end_pos = li_str.find("]", start_pos + 1)
        li.append(literal_eval(li_str[start_pos:end_pos + 1]))
        # Continue searching just past the closing bracket.
        cursor = end_pos + 1
    # li now contains a list of lists, i.e. the same structure as the input.
    csv(li)
if __name__=="__main__":
main()

Categories