How to count the characters from the csv? - python

My CSV file has the data below:
['value']
['abcd']
['def abc']
I want to count each character, in descending order of count ("value" is the header in the CSV file). I have written one script below. Is there a better script than this?
# Count every character in the "value" column of name.csv and print each
# character with its count, most frequent first.
from csv import DictReader
from collections import Counter

counts = Counter()
with open("name.csv") as f:
    for row in DictReader(f):
        # Counter.update on a string counts each character of it.
        counts.update(row["value"])

# most_common() already yields (char, count) pairs in descending count
# order, so no manual sort / intermediate dict is needed.
for char, count in counts.most_common():
    print(char, count)

# Count characters in name.csv (ignoring newlines and spaces) and print
# "char count" lines in descending order of count.
from collections import defaultdict

path = "name.csv"
d_list = defaultdict(int)
with open(path, 'r') as fl:
    for word in fl:
        for ch in word:
            d_list[ch] += 1  # bug fix: was dd[ch] — dd is undefined at this point

# Drop the separator characters; pop() with a default avoids a KeyError
# when the file happens to contain no newline or no space.
d_list.pop('\n', None)
d_list.pop(' ', None)

dd = sorted(d_list.items(), key=lambda item: item[1], reverse=True)
for ch, count in dd:
    print(ch + ' ' + str(count))

Related

dictionaries and files: when I use defaultdict the normal dict doesn't work and vice versa. Why?

# Tally the third whitespace-separated field of each non-blank line in
# sample.log — first with a plain dict + .get(), then with defaultdict(int).
from collections import defaultdict

d = {}
dd = defaultdict(int)
with open("sample.log") as samp:
    for line in samp:
        parts = line.split()
        if parts:  # skip blank lines; split once instead of twice
            # 0 because if it's not found, the count should be 0
            d[parts[2]] = d.get(parts[2], 0) + 1
    print(d)

    # bug fix: the first loop exhausted the file, so the second loop saw
    # nothing and dd stayed empty — rewind before reading again.
    samp.seek(0)
    for line in samp:
        parts = line.strip().split()
        if parts:
            dd[parts[2]] += 1
    print(dd)
The link for the log file: https://drive.google.com/file/d/1o1ZSvJDA7n6diVNeJztQMMecBUgQUQu9/view?usp=sharing

Sort word frequencies by descending order of frequencies

I have a text file that has word frequencies in the format:
word<space>freq
where freq is a number. I want to sort the file such that the frequencies are in descending order. For that, I have tried the following:
Read the file into a dictionary:
import json  # bug fix: json.dump was used below without importing json

# Read the file into a dictionary mapping word -> frequency.
kvp = {}
with open("/home/melvyn/word_freq.txt") as myfile:
    for line in myfile:
        # partition(" ")[::2] yields (text-before-space, text-after-space).
        word, freq = line.partition(" ")[::2]
        kvp[word.strip()] = int(freq)

# Sort the dictionary items by value; reverse=True gives the descending
# order the task asks for (the original sorted ascending).
d = sorted(kvp.items(), key=lambda x: x[1], reverse=True)

# Write the sorted pairs into another text file.
with open('/home/melvyn/word_freq_sorted.txt', 'w') as f:
    json.dump(d, f)
I have the following questions:
1. Sorting is not happening. Why?
2. How can I add new line between every key-value pair while doing a json.dump? Is there a cleaner way to write the dictionary contents into the text file?
Instead of json.dump, try writing to the file with file.write, formatting the strings as needed.
# Read "word freq" lines, sort by frequency (descending, as the question
# asks), and write "word count" lines to the output file with file.write
# instead of json.dump, so each pair lands on its own line.
kvp = {}
with open("a.txt", "r") as f:
    for line in f:
        word, freq = line.partition(" ")[::2]
        kvp[word.strip()] = int(freq)

# bug fix: reverse=True was missing, so the output was ascending.
d = sorted(kvp.items(), key=lambda x: x[1], reverse=True)

with open("b.txt", "w") as f:
    for i, v in d:
        f.write(str(i) + " " + str(v) + "\n")

count instances of duplicates in a column of a .csv file

import re, csv
import os, shutil
import io, json, collections
from collections import Counter, defaultdict, deque

SRC = "C:/Users/gurbir.sahota/Documents/python_csv_file_program/remove_duplicates.csv"
sn = 0    # 1st column (serial number)
p_f = 1   # 2nd column (pass/fail)

# The original opened final.csv for writing and never wrote to it; opening
# in "w" mode truncates the file, so that side effect is kept — but the
# handle is now closed properly instead of leaking.
with open("C:/Users/gurbir.sahota/Documents/python_csv_file_program/final.csv", "w") as out_fh:
    csv.writer(out_fh)

with open(SRC, 'r') as infh:
    data = csv.reader(infh)
    next(data)  # skip header
    seen = defaultdict(set)
    counts = Counter(
        row[sn]
        for row in data
        # bug fix: the membership test checks row[p_f], but the original
        # added row[sn] to the seen-set, so the two never matched; record
        # row[p_f] so each (serial, pass/fail) pair is counted once.
        # (.add() returns None, so "not ...add(...)" is always True.)
        if row[sn] and row[p_f] not in seen[row[sn]]
        and not seen[row[sn]].add(row[p_f])
    )
print(counts.most_common())
# want to count instances of the number 2 in
# [('VFGRP15040030', 2), ('VFGRP15370118', 2), ('VFGRP15150113', 2)]

# Read the file once into memory instead of reopening it three times
# (the original also leaked each of those handles).
with open(SRC) as fh:
    rows = list(csv.reader(fh))

x = len(rows)
print('# of rows including header=')
print(x)
count_pass = sum(1 for row in rows if row[1] == 'pass')
print('# of passes=')
print(count_pass)
count_fail = sum(1 for row in rows if row[1] == 'fail')
print('# of fails=')
print(count_fail)
# count_retest = ??
# NOTE(review): the original ended with "g.close" (no parentheses), which
# does nothing; all handles are now closed by the with-statements above.
# To get duplicates and their frequency for one column of a CSV file.
from collections import Counter
from operator import itemgetter

with open('data.csv', 'r', newline='') as f:
    rows = csv.reader(f)
    # Column number 1 is used here as the example column.
    column_counts = Counter(itemgetter(1)(row) for row in rows)

# Report only the values that occur more than once in that column.
for value, occurrences in column_counts.items():
    if occurrences > 1:
        print(value, occurrences)

Print lines with constraint in Python

Suppose we have the following text file with column a and column b:
D000001 T109
D000001 T195
D000002 T115
D000002 T131
D000003 T073
D000004 T170
I wonder how to produce the following structure:
D000001 T109 T195
D000002 T115 T131
D000003 T073
D000004 T170
Pasted below is initial skeleton in Python.
from __future__ import print_function

# Skeleton: echo each line of the file as "columnA columnB".
with open('descr2semtype_short.txt') as f:
    for raw in f:
        stripped = raw.rstrip()
        first, second = stripped.split()
        print(first + ' ' + second)
You can use itertools.groupby:
import itertools, operator

# Merge consecutive lines that share the same first field, printing the
# key followed by all of its second fields.  NOTE: groupby only groups
# adjacent runs, so the file must already be ordered by the first field.
with open('descr2semtype_short.txt') as f:
    pairs = (line.rstrip().split(None, 1) for line in f)
    first_field = operator.itemgetter(0)
    for key, group in itertools.groupby(pairs, first_field):
        values = [pair[1] for pair in group]
        print(key, ' '.join(values))
which gives the desired output:
D000001 T109 T195
D000002 T115 T131
D000003 T073
D000004 T170
Instead of printing them there, you can keep a dictionary of the lines, with the first element of the line as the key and the second element as the value (as a list, so that if another element comes for the same key you can append to it).
And then print them at the end.
Example -
from __future__ import print_function

# Group second-column values under their first-column key, then print
# each key with all of its values on one line.
d = {}
with open('descr2semtype_short.txt') as f:
    for line in f:
        line = line.rstrip()
        a, b = line.split()
        if a not in d:
            d[a] = []
        d[a].append(b)

# bug fix: dict.iteritems() does not exist on Python 3 (AttributeError);
# items() works on both versions and matches the print_function import.
for k, v in d.items():
    print(k + ' ' + ' '.join(v))
From Python 2.7 onwards, if the order of the lines is important, then instead of a plain dictionary we can use an OrderedDict.
Example -
from __future__ import print_function
from collections import OrderedDict
d = OrderedDict()
with open('descr2semtype_short.txt') as f:
for line in f:
line = line.rstrip()
a, b = line.split()
if a not in d:
d[a] = []
d[a].append(b)
for k,v in d.items():
print(k + ' ' + ' '.join(v))
I would do it with OrderedDict , this way:
from collections import OrderedDict

# Group values by key while preserving first-seen key order, echoing each
# (a, b) pair as it is read, then print the whole mapping at the end.
d = OrderedDict()
with open('1.txt', 'r') as f:
    for line in f:
        a, b = line.strip().split()
        # bug fix: "print a, b" is Python 2-only syntax (a SyntaxError on
        # Python 3); the function form prints the same text on both.
        print(a, b)
        if a not in d:
            d[a] = [b]
        else:
            d[a].append(b)
print(d)
Output:
OrderedDict([('D000001', ['T109', 'T109', 'T195']), ('D000002', ['T115', 'T115', 'T131']), ('D000003', ['T073', 'T073']), ('D000004', ['T170', 'T170', 'T175', 'T180'])])

Converting text file to dictionary in python

So lets say I want to convert the following to a dictionary where the 1st column is keys, and 2nd column is values.
http://pastebin.com/29bXkYhd
The following code works for this (assume romEdges.txt is the name of the file):
# Build a dict mapping first-column keys to a list of second-column
# values; a with-statement replaces the manual open()/close() pair so the
# file is closed even if a line fails to split.
dic = {}
with open('romEdges.txt') as f:
    for l in f:
        k, v = l.split()
        if k in dic:
            # bug fix: extend(v) spreads the string v into single
            # characters; append(v) keeps each value intact.
            dic[k].append(v)
        else:
            dic[k] = [v]
OK
But why doesn't the code work for this file?
http://pastebin.com/Za0McsAM
If anyone can tell me the correct code for the 2nd text file to work as well I would appreciate it.
Thanks in advance.
You should use append instead of extend
from collections import defaultdict

# Group each value under its key; defaultdict(list) removes the explicit
# "is the key present yet" check.
d = defaultdict(list)
with open("romEdges.txt") as fin:
    for line in fin:
        k, v = line.strip().split()
        d[k].append(v)
# bug fix: "print d" is Python 2-only syntax; the call form works on both.
print(d)
or using sets to prevent duplicates
# Variant using sets so duplicate values for a key are silently dropped.
d = defaultdict(set)
with open("romEdges.txt") as fin:
    for line in fin:
        k, v = line.strip().split()
        d[k].add(v)
# bug fix: "print d" is Python 2-only syntax; the call form works on both.
print(d)
If you want to append the data to dictionary, then you can use update in python. Please use following code:
# Build a dict of key -> list of values from a two-column file.
f = open('your file name')
dic = {}
for l in f:
    k, v = l.split()
    if k in dic:
        # bug fix: the original called dict.update({k: v}) on the dict
        # *class* (a TypeError) and would have stored a bare string where
        # every other entry holds a list; appending to the existing list
        # matches the output shown below.
        dic[k].append(v)
    else:
        dic[k] = [v]
print(dic)
f.close()
output:
{'0100464': ['0100360'], '0100317': ['0100039'], '0100405': ['0100181'], '0100545': ['0100212'], '0100008': ['0000459'], '0100073': ['0100072'], '0100044': ['0100426'], '0100062': ['0100033'], '0100061': ['0000461'], '0100066': ['0100067'], '0100067': ['0100164'], '0100064': ['0100353'], '0100080': ['0100468'], '0100566': ['0100356'], '0100048': ['0100066'], '0100005': ['0100448'], '0100007': ['0100008'], '0100318': ['0100319'], '0100045': ['0100046'], '0100238': ['0100150'], '0100040': ['0100244'], '0100024': ['0100394'], '0100025': ['0100026'], '0100022': ['0100419'], '0100009': ['0100010'], '0100020': ['0100021'], '0100313': ['0100350'], '0100297': ['0100381'], '0100490': ['0100484'], '0100049': ['0100336'], '0100075': ['0100076'], '0100074': ['0100075'], '0100077': ['0000195'], '0100071': ['0100072'], '0100265': ['0000202'], '0100266': ['0000201'], '0100035': ['0100226'], '0100079': ['0100348'], '0100050': ['0100058'], '0100017': ['0100369'], '0100030': ['0100465'], '0100033': ['0100322'], '0100058': ['0100056'], '0100013': ['0100326'], '0100036': ['0100463'], '0100321': ['0100320'], '0100323': ['0100503'], '0100003': ['0100004'], '0100056': ['0100489'], '0100055': ['0100033'], '0100053': ['0100495'], '0100286': ['0100461'], '0100285': ['0100196'], '0100482': ['0100483']}

Categories