Print out the character, word, and line amounts using Python - python

This is what I have so far:
def stats(filename):
    """Print the number of lines, words, and characters in file filename."""
    infile = open(filename)
    lines = infile.readlines()
    # readlines() has consumed the whole file at this point, so both read()
    # calls below return the empty string.
    words = infile.read()
    chars = infile.read()
    infile.close()
    print("line count:", len(lines))
    print("word count:", len(words.split()))
    print("character counter:", len(chars))
When executed, it prints the number of lines correctly, but prints 0 for the word and character counts. Not sure why...

You can iterate through the file once and count lines, words and chars without seeking back to the beginning multiple times, which you would need to do with your approach because you exhaust the iterator when counting lines:
def stats(filename):
    """Print the line, word, and character counts of the file filename.

    The file is read once, line by line. Characters include the line
    endings as delivered by text-mode reading.

    Returns True if at least one word occurs more than once, else False.
    """
    lines = chars = 0
    words = []
    with open(filename) as infile:
        for line in infile:
            lines += 1
            words.extend(line.split())
            chars += len(line)
    print("line count:", lines)
    print("word count:", len(words))
    print("character counter:", chars)
    # A set drops repeats, so it is shorter than the list iff a word repeats.
    return len(words) > len(set(words))
Or alternatively use the side effect that the file is at the end for chars:
def stats(filename):
    """Print the line, word, and character counts of the file filename.

    The character figure is taken from the file position once the whole
    file has been iterated.

    Returns True if at least one word occurs more than once, else False.
    """
    lines = 0  # stays 0 for an empty file, where the loop body never runs
    words = []
    with open(filename) as infile:
        for lines, line in enumerate(infile, 1):
            words.extend(line.split())
        # The loop has exhausted the file, so the position is at its end.
        # NOTE(review): text-mode tell() is documented as an opaque number;
        # it matches the character count only for single-byte encodings
        # with '\n' line endings -- confirm this is acceptable for your data.
        chars = infile.tell()
    print("line count:", lines)
    print("word count:", len(words))
    print("character counter:", chars)
    # A set drops repeats, so it is shorter than the list iff a word repeats.
    return len(words) > len(set(words))

You need to go back to the beginning of the file with infile.seek(0). After you read, the position is at the end of the file; seek(0) resets it to the start so that you can read again.
# Demonstrates rewinding with seek(0) so the same handle can be read again.
with open('data') as infile:
    lines = infile.readlines()  # leaves the position at the end of the file
    infile.seek(0)  # rewind before reading again
    words = infile.read()
    infile.seek(0)
    chars = infile.read()
print("line count:", len(lines))
print("word count:", len(words.split()))
print("character counter:", len(chars))
Output:
line count: 2
word count: 19
character counter: 113
other way of doing it....:
from collections import Counter
from itertools import chain

with open('data') as infile:
    lines = infile.readlines()
cnt_lines = len(lines)
# Flatten the per-line word lists into a single list of words.
words = list(chain.from_iterable(x.split() for x in lines))
cnt_words = len(words)
# Only characters that belong to words are counted; whitespace is excluded.
cnt_chars = sum(len(word) for word in words)
# show words frequency
print(Counter(words))

You have exhausted the iterator after your call to readlines. You can seek back to the start, but really you don't need to read the whole file into memory at all:
def stats(filename):
    """Count lines, characters, and words in the file filename.

    The file is streamed line by line. Words already seen are remembered
    only until the first duplicate is found.

    Returns a tuple (lines, chars, words, dupes), where chars includes the
    line endings and dupes is True if any word occurs more than once.
    """
    lines = chars = words = 0  # lines stays 0 when the file is empty
    dupes = False
    seen = set()
    with open(filename) as f:
        for lines, line in enumerate(f, 1):
            chars += len(line)
            spl = line.split()
            words += len(spl)
            if dupes:
                continue  # answer already known; stop storing words
            # Check word by word so a repeat inside one line is caught too.
            for word in spl:
                if word in seen:
                    dupes = True
                    break
                seen.add(word)
    return lines, chars, words, dupes
Then assign the values by unpacking:
no_lines, no_chars, no_words, has_dupes = stats("your_file")
You may want to use chars += len(line.rstrip()) if you don't want to include the line endings. The code stores only the data it needs; using readlines, read, dicts of the full data, etc. means that your code won't be very practical for large files.

# Count the lines, words, and characters of File_Name and print the totals.
File_Name = 'file.txt'
line_count = 0
word_count = 0
char_count = 0
with open(File_Name, 'r') as fh:
    # This will produce a list of lines.
    # Each line of the file will be an element of the list.
    data = fh.readlines()
# Count of total number for list elements == total number of lines.
line_count = len(data)
for line in data:
    word_count = word_count + len(line.split())
    char_count = char_count + len(line)  # includes the line ending
print('Line Count : ', line_count)
print('Word Count : ', word_count)
print('Char Count : ', char_count)

Related

How to compare word frequencies from two text files?

How can I compare word frequencies from two text files in Python? For example, if a word occurs in both file1 and file2, it should be written only once, and its frequencies should not be added together; the result should be {'The': 3,5}, where 3 is the frequency in file1 and 5 is the frequency in file2. If a word exists in only one of the files, the frequency for the other file should be 0. Please help.
Here is what I have done so far:
import operator

f1=open('file1.txt','r') #file 1
f2=open('file2.txt','r') #file 2
wordlist=[]
wordlist2=[]
# Collect every whitespace-separated word of each file.
for line in f1:
    for word in line.split():
        wordlist.append(word)
for line in f2:
    for word in line.split():
        wordlist2.append(word)
# Count the occurrences of each word in file 1.
worddictionary = {}
for word in wordlist:
    if word in worddictionary:
        worddictionary[word] += 1
    else:
        worddictionary[word] = 1
# Count the occurrences of each word in file 2.
worddictionary2 = {}
for word in wordlist2:
    if word in worddictionary2:
        worddictionary2[word] += 1
    else:
        worddictionary2[word] = 1
print(worddictionary)
print(worddictionary2)
Edit: Here's the more general way you would do this for any list of files (explanation in comments):
file_names = ['file1.txt', 'file2.txt']  # Every file you want to compare
num_files = len(file_names)
frequencies = {}  # One dictionary: word -> list with one count per file
for i, name in enumerate(file_names):  # Loop over the files, keeping an index i
    with open(name, 'r') as f:  # The file is closed again when the block ends
        for line in f:  # Get the lines of that file
            for word in line.split():  # Get the words of that line
                if word not in frequencies:
                    # A list of 0's for any word not seen yet -- one 0 per file
                    frequencies[word] = [0] * num_files
                frequencies[word][i] += 1  # Increment the count for that word and file
print(frequencies)
Keeping with the code you wrote, here's how you could create a combined dictionary:
import operator

wordlist = []
wordlist2 = []
# Collect every whitespace-separated word of each file.
with open('file1.txt', 'r') as f1:  # file 1
    for line in f1:
        for word in line.split():
            wordlist.append(word)
with open('file2.txt', 'r') as f2:  # file 2
    for line in f2:
        for word in line.split():
            wordlist2.append(word)
# Count the occurrences of each word in file 1.
worddictionary = {}
for word in wordlist:
    if word in worddictionary:
        worddictionary[word] += 1
    else:
        worddictionary[word] = 1
# Count the occurrences of each word in file 2.
worddictionary2 = {}
for word in wordlist2:
    if word in worddictionary2:
        worddictionary2[word] += 1
    else:
        worddictionary2[word] = 1
# Create a combined dictionary: word -> [count in file 1, count in file 2]
combined_dictionary = {}
all_word_set = set(worddictionary.keys()) | set(worddictionary2.keys())
for word in all_word_set:
    # A word missing from one file gets a count of 0 for that file.
    combined_dictionary[word] = [
        worddictionary.get(word, 0),
        worddictionary2.get(word, 0),
    ]
print(worddictionary)
print(worddictionary2)
print(combined_dictionary)
Edit: I misunderstood the problem, the code now works for your question.
wordList = {}  # word -> count (a dictionary, despite the name)
with open('file1.txt', 'r') as f1:  # file 1
    for line in f1:  # for each line in the file
        for word in line.split():  # for each word in each line
            if word not in wordList:  # if the word is not already in our dictionary
                wordList[word] = 0  # add the word to the dictionary
with open('file2.txt', 'r') as f2:  # file 2
    for line in f2:  # for each line in the file
        for word in line.split():  # for each word in each line
            if word in wordList:  # if the word is already in our dictionary
                wordList[word] = wordList[word] + 1  # add one to its value
# Second pass over file1; it has to be re-opened because the first pass
# left it at the end of the file.
with open('file1.txt', 'r') as f1:
    for line in f1:  # removing keys whose values are 0
        for word in line.split():  # for each word in each line
            if word not in wordList:
                continue  # already removed at an earlier occurrence
            if wordList[word] == 0:  # if its value is 0
                del wordList[word]  # remove it from the dictionary
            else:
                # if its value is not 0, add one for each occurrence in file1
                wordList[word] = wordList[word] + 1
print(wordList)
Add the words from the first file; whenever one of those words also appears in the second file, add one to its value.
After that, check each word: if its value is 0, remove it.
This can't be done by iterating over the dictionary, because the dictionary would change size while it is being iterated over.
This is how you would implement it for multiple files (more complex):
fileList = ["file1.txt", "file2.txt"]
fileWords = []  # one {word: count} dictionary per file, in fileList order
for name in fileList:  # for each file
    counts = {}
    with open(name, 'r') as file:  # closed again when the block ends
        for line in file:  # for each line in each file
            for word in line.split():  # for each word in each line
                # start new words at 0, then add one for this occurrence
                counts[word] = counts.get(word, 0) + 1
    fileWords.append(counts)  # add the dictionary to our list
for name, wL in zip(fileList, fileWords):
    print(f"File: {name}")
    for l in wL.items():
        print(l)
You might find the following demonstration program to be a good starting point for getting the word frequencies of your files:
#! /usr/bin/env python3
import collections
import pathlib
import pprint
import re
import sys
def main():
    """Pretty-print the word frequencies of the file named by sys.argv[0]."""
    # sys.argv[0] is the script's own name, so the demo analyses itself.
    freq = get_freq(sys.argv[0])
    pprint.pprint(freq)


def get_freq(path):
    """Return a collections.Counter of the words in the file at path.

    path may be a str or a pathlib.Path. A word is each match of the
    regular expression used below (a run of word characters).
    """
    if isinstance(path, str):
        path = pathlib.Path(path)
    # read_text() opens and closes the file; path.open().read() left the
    # handle open.
    return collections.Counter(
        match.group() for match in re.finditer(r'\b\w+\b', path.read_text())
    )


if __name__ == '__main__':
    main()
In particular, you will want to use the get_freq function to get a Counter object that tells you what the word frequencies are. Your program can call the get_freq function multiple times with different file names, and you should find the Counter objects to be very similar to the dictionaries you were previously using.

Output features of a file based on its longest line

I want to write a program file_stats.py that when run on the command line, accepts a text file name as an argument and outputs the number of characters, words, lines, and the length (in characters) of the longest line in the file. Does anyone know the proper syntax to do something like this if I want the output to look like this:
Characters: 553
Words: 81
Lines: 21
Longest line: 38
Assuming your file path is a string, something like this should work
file = "pathtofile.txt"
with open(file, "r") as f:
    text = f.read()
lines = text.split("\n")  # the pieces carry no trailing newline
# split() always returns at least one piece, so max() never sees an empty list
longest_line = max(len(l) for l in lines)
print("Longest line: {}".format(longest_line))
The whole program
# Print character, word, and line counts plus the longest line length.
n_chars = 0
n_words = 0
n_lines = 0
longest_line = 0
with open('my_text_file') as f:
    for line in f:
        # Find the number of Lines
        n_lines += 1
        # Find the number of Words
        words = line.split()
        n_words += len(words)
        # Find the number of Characters: only characters that belong to
        # words are counted, whitespace is excluded
        n_chars += sum(len(word) for word in words)
        # Find the Longest line, measured without its trailing newline;
        # starting from 0 also covers an empty file
        longest_line = max(longest_line, len(line.rstrip('\n')))
print("Characters:", n_chars)
print("Words:", n_words)
print("Lines:", n_lines)
print("Longest line:", longest_line)

python counting charaters without spaces in a file

How do you count characters without spaces? I am not getting the right number. The right value of num_charsx is 1761.
num_words = 0
num_chars = 0
with open("C:/Python33/fire.txt",'r') as f:
    for line in f:
        words = line.split('\n')
        num_words += len(words)
        num_chars += len(line)
# This statement sits after the loop, so line is the last line of the file.
num_charsx = num_chars - line.count(' ')
print(num_charsx)
2064
words = line.split('\n')
num_words += len(words)
doesn't do what you think it does. In the loop
for line in f:
line is a string that ends in '\n', so line.split('\n') is a two-item list, with the first item containing all the characters of the line apart from the terminating '\n'; the second item in that list is the empty string. Example:
line = 'This is a test\n'
words = line.split('\n')
print(words, len(words))
output
['This is a test', ''] 2
So your num_words += len(words) doesn't actually count words, it just gets twice the count of the number of lines.
To get an actual list of the words in line you need
words = line.split()
Your penultimate line
num_charsx = num_chars - line.count(' ')
is outside the for loop so it subtracts the space count of the last line of the file from the total num_chars, but I assume you really want to subtract the total space count of the whole file from num_chars.
Here's a repaired version of your code.
fname = "C:/Python33/fire.txt"
num_lines = 0  # stays 0 when the file is empty
num_words = 0
num_chars = 0
num_spaces = 0
with open(fname, 'r') as f:
    for num_lines, line in enumerate(f, 1):
        num_words += len(line.split())
        # Leave out the terminating '\n'; the last line may not have one.
        num_chars += (len(line) - 1) if line.endswith('\n') else len(line)
        num_spaces += line.count(' ')
num_charsx = num_chars - num_spaces
print(num_lines, num_words, num_chars, num_spaces, num_charsx)
I've modified the line reading loop to use enumerate. That's an efficient way to get the line number and the line contents without having to maintain a separate line counter.
In num_chars += len(line) - 1 the -1 is so we don't include the terminating '\n' of each line in the char count.
Note that on Windows text file lines are (normally) terminated with '\r\n' but that terminator gets converted to '\n' when you read a file opened in text mode. So on Windows the actual byte size of the file is num_chars + 2 * num_lines, assuming the last line has a '\r\n' terminator; it may not, in which case the actual size will be 2 bytes less than that.
You may want to try splitting the lines on ' ' instead of '\n', as the splitting on '\n' is pretty much already done by the for loop.
The other option if you just want a character count is you could just use the replace method to remove ' ' and then count the length of the string.
num_chars = len(line.replace(' ', ''))
You could also try this:
num_chars = 0   # characters excluding line terminators
num_charsx = 0  # characters excluding line terminators and spaces
with open("C:/Python33/fire.txt",'r') as f:
    for line in f:
        text = line.split('\n')[0]  # the line without its terminating '\n'
        num_chars += len(text)
        # Subtract the spaces of every line, not only those of the last one.
        num_charsx += len(text) - text.count(' ')
print(num_charsx)

word counter || python

I want to print the number of words in a txt file having 1-20 letter.
Tried this but it prints 20 zeroes instead. any idea?
edit - in the end the program should plot 20 numbers, each one is the number of words in the file containing 1-20 letters.
fin = open('words.txt')
for i in range(20):
    counter = 0
    # fin is iterated again on every pass of the outer loop.
    for line in fin:
        word = line.strip()
        if len(word) == i:
            counter = counter + 1
    print(counter, end=' ')
EDIT
To produce individual counts for each word length you can use a collections.Counter:
from collections import Counter
def word_lengths(f):
    """Yield the length of every whitespace-separated word in f.

    f is any iterable of strings, such as an open text file.
    """
    for line in f:
        for word in line.split():  # does not ignore punctuation
            yield len(word)
with open('words.txt') as fin:
    # Counter maps each word length (at most 20) to the number of such words.
    counts = Counter(length for length in word_lengths(fin) if length <= 20)
This uses a generator to read the file and produce a sequence of word lengths. The filtered word lengths are fed into a Counter. You could perform the length filtering on the Counter instead.
If you want to ignore punctuation you could look at using str.translate() to remove unwanted characters, or possibly re.split(r'\W+', line) instead of line.split().
Try it like this:
with open('words.txt') as fin:
    counter = 0  # number of words of at most 20 characters
    for line in fin:
        for word in line.split():
            if len(word) <= 20:
                counter = counter + 1
print(counter)
This could be simplified to:
with open('words.txt') as fin:
    # One 1 per word of at most 20 characters; a generator avoids building
    # a throwaway list.
    counter = sum(1 for line in fin
                  for word in line.split() if len(word) <= 20)
but that's playing code golf.
You can also use a collections.Counter if it is practical to read the entire file into memory:
from collections import Counter

with open('words.txt') as fin:
    c = Counter(fin.read().split())  # word -> number of occurrences
# Add up the occurrences of every word of at most 20 characters.
counter = sum(count for word, count in c.items() if len(word) <= 20)
And no doubt there are many other ways to do it. None of the above expect or handle punctuation.
It should be like this,counter shouldn't be in for loop,and you could use len() method to get the length of words:
with open("test") as f:
    counter = 0  # initialised once, before the loops
    for line in f:
        for word in line.split():
            if len(word) <= 20:
                counter += 1
print(counter)
Or my way:
import re

with open("file") as f:
    # Runs of non-whitespace; unlike re.split this never yields empty strings.
    words = re.findall(r'\S+', f.read())
# Count the words of 1 to 20 characters.
print(sum(1 for w in words if len(w) <= 20))
Hope this helps.
using regular expressions
import re

REGEX = r"(\b\S{1,20}\b)"
finder = re.compile(REGEX)
with open("words.txt") as infile:
    data = infile.read()
matches = finder.findall(data)
# lst[k] is the number of matches of length k + 1, covering lengths 1..20.
lst = [0] * 20
for m in matches:
    # A match is 1 to 20 characters long, so shift by one to stay in range.
    lst[len(m) - 1] += 1
print(lst)

I have a txt file. How can I take dictionary key values and print the line of text they appear in?

I have a txt file. I have written code that finds the unique words and the number of times each word appears in that file. I now need to figure out how to print the lines that those words appear in as well. How can I go about doing this?
Here is a sample output:
Analyze what file: itsy_bitsy_spider.txt
Concordance for file itsy_bitsy_spider.txt
itsy : Total Count: 2
Line:1: The ITSY Bitsy spider crawled up the water spout
Line:4: and the ITSY Bitsy spider went up the spout again
#this function collects the lines and the words that are not stop words.
def openFiles(openFile):
    """Append each stripped line to linelist and each non-stop word to wordlist.

    openFile is iterated line by line. Words are lower-cased before they are
    compared with stopwords; repeated words are appended every time.
    linelist, wordlist, and stopwords are defined outside this function.
    """
    for i in openFile:
        i = i.strip()
        linelist.append(i)
        for a in i.lower().split():
            if a not in stopwords:
                wordlist.append(a)
#this dictionary is used to count the number of times each word appears
countdict = {}
def countWords(this_list):
    """Add the words of this_list to the counts in countdict.

    Leading and trailing punctuation is stripped from each word before it
    is counted. punctuation is defined outside this function.
    """
    for word in this_list:
        depunct = word.strip(punctuation)
        if depunct in countdict:
            countdict[depunct] += 1
        else:
            countdict[depunct] = 1
from collections import defaultdict

target = 'itsy'
# word -> 1-based line numbers, one entry per occurrence of the word
word_summary = defaultdict(list)
with open('itsy.txt', 'r') as f:
    for line_no, line in enumerate(f, 1):
        for word in (w.lower() for w in line.split()):
            word_summary[word].append(line_no)
unique_words = len(word_summary)
# .get() does not insert an empty entry when the target never occurs
target_lines = word_summary.get(target, [])
target_occurence = len(target_lines)
line_nums = sorted(set(target_lines))  # each line once, in file order
print("There are %s unique words." % unique_words)
print("There are %s occurences of '%s'" % (target_occurence, target))
print("'%s' is found on lines %s" % (target, ', '.join(str(i) for i in line_nums)))
If you parsed the input text file line by line, you could maintain another dictionary that is a word -> List<Line> mapping. ie for each word in a line, you add an entry. Might look something like the following. Bearing in mind I'm not very familiar with python, so there may be syntactic shortcuts I've missed.
eg
countdict = {}  # word -> number of occurrences
linedict = {}   # word -> list of the lines the word occurs in
for line in text_file:
    # split() yields the words; iterating the string itself yields characters
    for word in line.split():
        depunct = word.strip(punctuation)
        if depunct in countdict:
            countdict[depunct] += 1
        else:
            countdict[depunct] = 1
        # add entry for word in the line dict if not there already
        if depunct not in linedict:
            linedict[depunct] = []
        # now add the word -> line entry
        linedict[depunct].append(line)
One modification you will probably need to make is to prevent duplicates being added to the linedict if a word appears twice in the line.
The above code assumes that you only want to read the text file once.
# word -> {'count': number of occurrences, 'line': set of lines containing it}
words = {}
with open("test.txt", "r") as openFile:
    for line in openFile:
        for word in line.strip().lower().split():
            # setdefault creates the entry the first time a word is seen
            wordDict = words.setdefault(word, { 'count': 0, 'line': set() })
            wordDict['count'] += 1
            wordDict['line'].add(line)  # a set keeps each line only once
print(words)

Categories