Total number of lines and words in a file - python

I have an exercise that asks me to calculate the number of lines and words in an email file, excluding the subject line.
I can get the total number of lines and words with the following code:
file = "email.txt"
num_lines = 0
num_words = 0
with open(file, 'r') as f:
    for line in f:
        words = line.split()
        if not line.startswith('Subject'):
            num_lines += 1
            num_words += len(words)
print(num_lines)
print(num_words)
I would like to define functions to get the same information; however, the second function, for the word count, is not returning the desired value.
textFile = "email.txt"

def count_lines():
    with open(textFile, 'r') as file:
        num_lines = 0
        for line in file:
            words = line.split()
            if not line.startswith('Subject'):
                num_lines = num_lines + 1
        return num_lines

def count_words():
    with open(textFile, 'r') as file:
        num_words = 0
        for words in file:
            words = line.split()
            if not line.startswith('Subject'):
                num_words = num_words + 1
        return num_words

print(count_lines())
print(count_words())

I would suggest another solution, using a list comprehension:
with open(textFile, 'r') as f:
    words_per_line = [len(line.split()) for line in f if not line.startswith('Subject')]
total_lines = len(words_per_line)
total_words = sum(words_per_line)
Here words_per_line contains the number of words in each line of your file, so taking its length (len) gives you the number of lines, and summing it gives you the total number of words.
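As for why your count_words function misbehaves: it loops with for words in file but then calls line.split() on a name that is never assigned inside the function, and it adds 1 per line instead of the number of words on that line. A minimal corrected version, keeping your structure:

def count_words():
    with open(textFile, 'r') as file:
        num_words = 0
        for line in file:                       # iterate over lines, naming each one 'line'
            if not line.startswith('Subject'):
                num_words += len(line.split())  # add the word count of this line
        return num_words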

Related

Count words in multiple files and show the count and in how many files it appeared

I have a path with 500 files and I need to count the words in all of them, showing each word's count and how many files it appeared in. I tried to use dict() and a variable for counting the files, but only managed to count the words:
counts = dict()
...
for filename in fileList:
    file = open(filename, 'r')
    for line in file:
        words += line.split()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
...
for item in counts.items():
    writer = open("Output.txt", "a", encoding="utf8")
    writer.write("{}".format(item) + "\n")
    writer.close()
I'm a Python newbie; the expected output is a file like this:
( word ; count of that word ; # of files where that word appeared )
This is untested, but this shows the theory:
from collections import Counter

files = Counter()
words = Counter()

for fn in list_of_files:
    thisfile = set()
    for word in open(fn).read().split():
        words[word] += 1
        thisfile.add(word)
    for word in thisfile:
        files[word] += 1

writer = open("Output.txt", "w", encoding="utf8")
for word in files.keys():
    print(f"{word};{words[word]};{files[word]}", file=writer)
writer.close()

How to compare word frequencies from two text files?

How do I compare word frequencies from two text files in Python? For example, if a word occurs in both file1 and file2, it should be written only once, without adding the two frequencies together; the entry should look like {'The': [3, 5]}, where 3 is the frequency in file1 and 5 is the frequency in file2. If a word exists in only one of the files, the count for the other file should be 0. Please help.
Here is what I have done so far:
import operator

f1 = open('file1.txt', 'r')  # file 1
f2 = open('file2.txt', 'r')  # file 2
wordlist = []
wordlist2 = []
for line in f1:
    for word in line.split():
        wordlist.append(word)
for line in f2:
    for word in line.split():
        wordlist2.append(word)
worddictionary = {}
for word in wordlist:
    if word in worddictionary:
        worddictionary[word] += 1
    else:
        worddictionary[word] = 1
worddictionary2 = {}
for word in wordlist2:
    if word in worddictionary2:
        worddictionary2[word] += 1
    else:
        worddictionary2[word] = 1
print(worddictionary)
print(worddictionary2)
Edit: Here's the more general way you would do this for any list of files (explanation in comments):
f1 = open('file1.txt', 'r')  # file 1
f2 = open('file2.txt', 'r')  # file 2
file_list = [f1, f2]  # This would hold all your open files
num_files = len(file_list)
frequencies = {}  # We'll just make one dictionary to hold the frequencies
for i, f in enumerate(file_list):  # Loop over the files, keeping an index i
    for line in f:  # Get the lines of that file
        for word in line.split():  # Get the words of that line
            if not word in frequencies:
                frequencies[word] = [0 for _ in range(num_files)]  # a list of 0's for any unseen word -- one 0 per file
            frequencies[word][i] += 1  # Increment the frequency count for that word and file
print(frequencies)
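A variant of the same idea that opens the files by name and closes them automatically (a sketch; the file names are assumed):

file_names = ['file1.txt', 'file2.txt']
frequencies = {}
for i, name in enumerate(file_names):
    with open(name, 'r') as f:  # closed automatically at the end of the block
        for line in f:
            for word in line.split():
                frequencies.setdefault(word, [0] * len(file_names))[i] += 1
print(frequencies)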
Keeping with the code you wrote, here's how you could create a combined dictionary:
import operator

f1 = open('file1.txt', 'r')  # file 1
f2 = open('file2.txt', 'r')  # file 2
wordlist = []
wordlist2 = []
for line in f1:
    for word in line.split():
        wordlist.append(word)
for line in f2:
    for word in line.split():
        wordlist2.append(word)
worddictionary = {}
for word in wordlist:
    if word in worddictionary:
        worddictionary[word] += 1
    else:
        worddictionary[word] = 1
worddictionary2 = {}
for word in wordlist2:
    if word in worddictionary2:
        worddictionary2[word] += 1
    else:
        worddictionary2[word] = 1

# Create a combined dictionary
combined_dictionary = {}
all_word_set = set(worddictionary.keys()) | set(worddictionary2.keys())
for word in all_word_set:
    combined_dictionary[word] = [0, 0]
    if word in worddictionary:
        combined_dictionary[word][0] = worddictionary[word]
    if word in worddictionary2:
        combined_dictionary[word][1] = worddictionary2[word]

print(worddictionary)
print(worddictionary2)
print(combined_dictionary)
Edit: I misunderstood the problem; the code below now works for your question.
f1 = open('file1.txt', 'r')  # file 1
f2 = open('file2.txt', 'r')  # file 2
wordList = {}
for line in f1.readlines():  # for each line (file.readlines() returns a list)
    for word in line.split():  # for each word in each line
        if not word in wordList:  # if the word is not already in our dictionary
            wordList[word] = 0  # add the word to the dictionary
for line in f2.readlines():
    for word in line.split():
        if word in wordList:  # if the word is already in our dictionary
            wordList[word] = wordList[word] + 1  # add one to its value
f1.close()  # close files
f2.close()
f1 = open('file1.txt', 'r')  # have to re-open because we are at the end of the file;
                             # there might be an easier way of doing this
for line in f1.readlines():  # removing keys whose values are 0
    for word in line.split():  # for each word in each line
        try:
            if wordList[word] == 0:  # if its value is 0
                del wordList[word]  # remove it from the dictionary
            else:
                wordList[word] = wordList[word] + 1  # otherwise add one for each occurrence in file1
        except KeyError:
            pass  # the word was already deleted from wordList
f1.close()
print(wordList)
First, add the words from file1 as keys; then, for each word in file2 that is already a key, add one to its value. After that, check each word from file1 again: if its value is still 0, the word never appeared in file2, so remove it. The removal can't be done by iterating over the dictionary itself, because the dictionary changes size while you iterate over it.
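The resizing problem can also be sidestepped by iterating over a snapshot of the items; a minimal sketch of just the zero-removal step:

for word, count in list(wordList.items()):  # list(...) snapshots the items,
    if count == 0:                          # so the dict can shrink safely
        del wordList[word]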
This is how you would implement it for multiple files (more complex):
fileList = ["file1.txt", "file2.txt"]
openList = []
for i in range(len(fileList)):
    openList.append(open(fileList[i], 'r'))
fileWords = []
for i, file in enumerate(openList):  # for each file
    fileWords.append({})  # add a dictionary to our list
    for line in file:  # for each line in each file
        for word in line.split():  # for each word in each line
            if word in fileWords[i]:  # if the word is already in our dictionary
                fileWords[i][word] += 1  # add one to it
            else:
                fileWords[i][word] = 1  # add it to our dictionary with value 1
for f in openList:
    f.close()
for i, wL in enumerate(fileWords):
    print(f"File: {fileList[i]}")
    for item in wL.items():
        print(item)
You might find the following demonstration program to be a good starting point for getting the word frequencies of your files:
#! /usr/bin/env python3
import collections
import pathlib
import pprint
import re
import sys


def main():
    freq = get_freq(sys.argv[0])
    pprint.pprint(freq)


def get_freq(path):
    if isinstance(path, str):
        path = pathlib.Path(path)
    return collections.Counter(
        match.group() for match in re.finditer(r'\b\w+\b', path.open().read())
    )


if __name__ == '__main__':
    main()
In particular, you will want to use the get_freq function to get a Counter object that tells you what the word frequencies are. Your program can call the get_freq function multiple times with different file names, and you should find the Counter objects to be very similar to the dictionaries you were previously using.
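For instance, called on two of your files (hypothetical names), the returned Counter objects support handy arithmetic:

freq1 = get_freq('file1.txt')
freq2 = get_freq('file2.txt')
print(freq1 + freq2)            # combined frequencies across both files
print(set(freq1) & set(freq2))  # words that appear in both files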

Output features of a file based on its longest line

I want to write a program file_stats.py that, when run on the command line, accepts a text file name as an argument and outputs the number of characters, words, and lines, and the length (in characters) of the longest line in the file. Does anyone know the proper syntax to do something like this if I want the output to look like this:
Characters: 553
Words: 81
Lines: 21
Longest line: 38
Assuming your file path is a string, something like this should work:
file = "pathtofile.txt"
with open(file, "r") as f:
    text = f.read()
lines = text.split("\n")
longest_line = 0
for l in lines:
    if len(l) > longest_line:
        longest_line = len(l)
print("Longest line: {}".format(longest_line))
The whole program:
n_chars = 0
n_words = 0
n_lines = 0
longest_line = 0

with open('my_text_file') as f:
    lines = f.readlines()

# Find the number of lines
n_lines = len(lines)

# Find the longest line (note: len(line) includes the trailing newline)
longest_line = max([len(line) for line in lines])

# Find the number of words
words = []
line_words = [line.split() for line in lines]
for line in line_words:
    for word in line:
        words.append(word)
n_words = len(words)

# Find the number of characters (whitespace is not counted here)
chars = []
line_chars = [list(word) for word in words]
for line in line_chars:
    for char in line:
        chars.append(char)
n_chars = len(chars)

print("Characters: ", n_chars)
print("Words: ", n_words)
print("Lines: ", n_lines)
print("Longest: ", longest_line)

Print out the character, word, and line amounts using Python

This is what I have so far:
def stats(filename):
    'prints the number of lines, words, and characters in file filename'
    infile = open(filename)
    lines = infile.readlines()
    words = infile.read()
    chars = infile.read()
    infile.close()
    print("line count:", len(lines))
    print("word count:", len(words.split()))
    print("character counter:", len(chars))
When executed, it reports the number of lines properly, but returns 0 for the word and character counts. Not sure why...
You can iterate through the file once and count lines, words and chars without seeking back to the beginning multiple times, which you would need to do with your approach because you exhaust the iterator when counting lines:
def stats(filename):
    'prints the number of lines, words, and characters in file filename'
    lines = chars = 0
    words = []
    with open(filename) as infile:
        for line in infile:
            lines += 1
            words.extend(line.split())
            chars += len(line)
    print("line count:", lines)
    print("word count:", len(words))
    print("character counter:", chars)
    return len(words) > len(set(words))  # True if there are duplicate words
Or alternatively use the side effect that the file is at the end for chars:
def stats(filename):
    'prints the number of lines, words, and characters in file filename'
    words = []
    with open(filename) as infile:
        for lines, line in enumerate(infile, 1):
            words.extend(line.split())
        chars = infile.tell()  # current position == bytes read so far
    print("line count:", lines)
    print("word count:", len(words))
    print("character counter:", chars)
    return len(words) > len(set(words))  # True if there are duplicate words
You need to go back to the beginning of the file with infile.seek(0). After a read, the position is at the end; seek(0) resets it to the start so that you can read again.
infile = open('data')
lines = infile.readlines()
infile.seek(0)
print(lines)
words = infile.read()
infile.seek(0)
chars = infile.read()
infile.close()
print("line count:", len(lines))
print("word count:", len(words.split()))
print("character counter:", len(chars))
Output:
line count: 2
word count: 19
character counter: 113
Another way of doing it:
from collections import Counter
from itertools import chain

infile = open('data')
lines = infile.readlines()
cnt_lines = len(lines)
words = list(chain.from_iterable([x.split() for x in lines]))
cnt_words = len(words)
cnt_chars = len([c for word in words for c in word])

# show word frequencies
print(Counter(words))
You have exhausted the iterator after your call to readlines. You can seek back to the start, but really you don't need to read the whole file into memory at all:
def stats(filename):
    chars, words, dupes = 0, 0, False
    seen = set()
    with open(filename) as f:
        for i, line in enumerate(f, 1):
            chars += len(line)
            spl = line.split()
            words += len(spl)
            if dupes or not seen.isdisjoint(spl):
                dupes = True
            elif not dupes:
                seen.update(spl)
    return i, chars, words, dupes
Then assign the values by unpacking:
no_lines, no_chars, no_words, has_dupes = stats("your_file")
You may want to use chars += len(line.rstrip()) if you don't want to include the line endings. This code stores only exactly the amount of data needed; using readlines, read, or dicts of the full data means that for large files your code won't be very practical.
File_Name = 'file.txt'
line_count = 0
word_count = 0
char_count = 0

with open(File_Name, 'r') as fh:
    # This will produce a list of lines.
    # Each line of the file will be an element of the list.
    data = fh.readlines()
    # The total number of list elements == the total number of lines.
    line_count = len(data)
    for line in data:
        word_count = word_count + len(line.split())
        char_count = char_count + len(line)

print('Line Count : ', line_count)
print('Word Count : ', word_count)
print('Char Count : ', char_count)

Python Count not resetting?

I'm trying to insert an increment after each occurrence of ~||~ in my .txt file. I have this working; however, I want to split it up so that after each semicolon, the count starts back over at 1.
So far I have the following, which does everything except restart the count at semicolons.
inputfile = "output2.txt"
outputfile = "/output3.txt"

f = open(inputfile, "r")
words = f.read().split('~||~')
f.close()

count = 1
for i in range(len(words)):
    if ';' in words[i]:
        count = 1
    words[i] += "~||~" + str(count)
    count = count + 1

f2 = open(outputfile, "w")
f2.write("".join(words))
Why not first split the file on the semicolons, then count the occurrences of '~||~' in each segment?
import re

count = 0
with open(inputfile) as f:
    semicolon_separated_chunks = f.read().split(';')
# '|' is special in a regex, so the pattern must be escaped
count = sum(len(re.findall(re.escape('~||~'), chunk))
            for chunk in semicolon_separated_chunks)
# if the file text is 'hello there ~||~ what is that; what ~||~ do you ~|| mean; nevermind ~||~'
# then count = 3 (the lone '~||' is not a full match)
Instead of resetting the counter the way you are now, you could do the initial split on ;, and then split the substrings on ~||~. You'd have to store your words another way, since you're no longer doing words = f.read().split('~||~'), but it's safer to make an entirely new list anyway.
inputfile = "output2.txt"
outputfile = "/output3.txt"

f = open(inputfile, "r")
lines = f.read().split(';')
f.close()

all_lines = []
for line in lines:
    count = 1
    pieces = line.split('~||~')
    rebuilt = []
    for piece in pieces[:-1]:      # every '~||~' gets a number...
        rebuilt.append(piece + "~||~" + str(count))
        count += 1
    rebuilt.append(pieces[-1])     # ...but nothing is appended after the last piece
    all_lines.append("".join(rebuilt))

f2 = open(outputfile, "w")
f2.write(";".join(all_lines))      # put the semicolons back
See if this works for you. You also may want to put some strategically-placed newlines in there, to make the output more readable.
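For instance, a one-line tweak to the sketch above that starts each semicolon-separated chunk on its own line:

f2.write(";\n".join(all_lines))  # newline after each semicolon for readability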