I have two text files that contain data like this.
I want this to be done in Hadoop. Can anyone suggest a way to do it?
textfile1 --> 1 goerge hyder
2 ganesh singapore
textfile2 --> 1 goergy hydel
2 ganest singapore
It has to do the comparison column by column and character by character, so after the comparison it should produce a report like this:
column_name source destiny mismatch
xxx george georgy y
ganesh ganest h
hyder hydel r
Please help me in this.
# Compare textfile1.txt and textfile2.txt line by line (ignoring trailing
# whitespace) and record the verdict both on stdout and in report.txt.
# The inputs are opened for reading: a file opened in append mode ('a')
# cannot be read from.
with open('textfile1.txt', 'r') as first:
    lines1 = [line.rstrip() for line in first]
with open('textfile2.txt', 'r') as second:
    lines2 = [line.rstrip() for line in second]

# The report is opened for writing; the default mode is read-only and would
# reject write().
with open('report.txt', 'w') as report:
    if lines1 == lines2:
        print("It Is the Same Thing")
        report.write('It is The Same Thing with the text 1 and 2')
    else:
        print("it Is Not The Same Thing")
        report.write('It is Not The Same Thign With the text 1 and 2')
    report.write('\n')
# Report every character position at which corresponding words of the two
# files differ. textfile1 / textfile2 are path variables supplied by the
# surrounding code.
with open(textfile1, "r") as f1, open(textfile2, "r") as f2:
    # split() with no argument splits on any whitespace, so the last word of
    # one line is not glued to the first word of the next by the newline.
    words1 = f1.read().split()
    words2 = f2.read().split()

# zip() stops at the shorter sequence, so a missing word or words of
# different length cannot raise IndexError; only positions present on both
# sides are compared.
for word1, word2 in zip(words1, words2):
    if word1 == word2:
        continue
    for char1, char2 in zip(word1, word2):
        if char1 != char2:
            print(word1, word2, char2)
As Seer.The mentioned above, you can use difflib.
import difflib
# Read the files: each line becomes the list of its space-separated columns.
with open('textfile1.txt', 'r') as source_file:
    list1 = [line.rstrip().split(" ") for line in source_file]
with open('textfile2.txt', 'r') as target_file:
    list2 = [line.rstrip().split(" ") for line in target_file]

# Get the output: one "source destination mismatch" line per column pair.
# zip() pairs rows and columns directly, so rows with differing column
# counts cannot raise IndexError.
for row1, row2 in zip(list1, list2):
    for source, target in zip(row1, row2):
        # ndiff() on two strings compares them character by character.
        # Entries starting with "- " are characters only in the source and
        # entries starting with "+ " are characters only in the target.
        # Testing the prefix (rather than `"-" in entry`) keeps a literal
        # hyphen in the data from being mistaken for a removal marker.
        diff = list(difflib.ndiff(source, target))
        removed = [entry[-1] for entry in diff if entry.startswith("- ")]
        added = [entry[-1] for entry in diff if entry.startswith("+ ")]
        # Prefer the first removed character; fall back to the first added
        # one so a pure insertion is not reported as "no difference".
        output_list = removed or added or ["no difference"]
        print("{} {} {}".format(source, target, output_list[0]))
The output should look like:
goerge goergy e
hyder hydel r
ganesh ganest h
singapore singapore no difference
Related
Eg:FileContains
TextAA
Content
TextXy
Content
TextXX
Expected:
Text1
Content
Text2
Content
Text3
# Number every token containing 'Text' consecutively (Text1, Text2, ...)
# across the whole of input.txt and append the rewritten lines to output.txt.
count = 0  # initialised once, so numbering continues from line to line
with open('input.txt', 'r') as source, open('output.txt', 'a') as output:
    for line in source:
        tokens = line.split()
        for index, token in enumerate(tokens):
            if 'Text' in token:
                count += 1
                tokens[index] = 'Text' + str(count)
        output.write(' '.join(tokens) + '\n')
I have an assignment in a basic programming course where I need to rewrite this code using a while loop instead of a for loop, but I don't know how to do it.
This is my code so far:
def read_txt(file_txt):
    """Return the lines of the text file at *file_txt* as a list.

    Each element keeps its trailing newline, exactly as produced by
    ``readlines()``. The ``with`` block closes the file even if reading
    raises.
    """
    with open(file_txt, "r") as file:
        return file.readlines()
# Print each student's name followed by the average of their scores.
# Every line is expected to look like "name,score,score,...".
file_txt = input("file: ")
lines = read_txt(file_txt)
for l in lines:
    asd = l.strip().split(",")
    scores = asd[1:]
    if not scores:
        # Blank line, or a name with no scores: there is nothing to average
        # and dividing by zero would abort the whole run.
        continue
    average = sum(int(value) for value in scores) / len(scores)
    print(asd[0], average)
The text file looks like this:
edward,4,3,1,2
sara,5,4,1,0
def read_txt(file_txt):
    """Read the text file at *file_txt* and return its lines as a list.

    Lines keep their trailing newline characters. The file is managed by a
    ``with`` block so it is closed even when ``readlines()`` raises.
    """
    with open(file_txt, "r") as file:
        lines = file.readlines()
    return lines
# While-loop version: print each student's name and average score.
file_txt = input("file: ")
lines = read_txt(file_txt)
# Reverse once so that pop(), which removes from the end of the list,
# hands the lines back in their original order.
lines.reverse()
while lines:
    asd = lines.pop().strip().split(",")
    length = len(asd)
    if length < 2:
        # Blank line or a name without scores: skip it rather than divide
        # by zero below. The line was already popped, so the loop advances.
        continue
    score = 0
    i = 1
    while i < length:
        score += int(asd[i])
        i += 1
    average = score / (length - 1)
    print(asd[0], average)
This while loop iterates until lines is empty, popping the items out one by one.
For loops are more suitable than while loops for iterating over the lines of a file. A few improvements here are: (1) use the builtin sum instead of manually adding up the scores, and (2) don't read all the lines of the file at once if the file is very big.
# Stream the file one line at a time and print "name average" per line.
file_txt = input("file: ")
with open(file_txt) as f:
    while True:
        line = f.readline()
        if not line:
            # readline() returns an empty string only at end of file.
            break
        # Split off the name; everything after the first comma is scores.
        name, remainder = line.split(',', maxsplit=1)
        marks = [int(mark) for mark in remainder.split(',')]
        avg = sum(marks) / len(marks)
        print(f'{name} {avg}')
As you can see above, a while loop needs the `if not line` check to detect the end of the file; a for loop does not need it, because file objects implement the iterator protocol (`__iter__`).
The Python 3.8 walrus operator makes that slightly easier:
# Same streaming average, with the read and the end-of-file test folded
# into the loop condition by the assignment expression.
file_txt = input("file: ")
with open(file_txt) as f:
    while line := f.readline():
        name, remainder = line.split(',', maxsplit=1)
        marks = list(map(int, remainder.split(',')))
        print(f'{name} {sum(marks) / len(marks)}')
The following gives the exact same output without using any for loop.
# Average each student's scores using while loops only.
filename = input("file: ")
with open(filename) as handle:
    pending = handle.readlines()

# Drain the list from the back, trimming the newline off each line; this
# leaves the lines in reverse order.
cleaned = []
while pending:
    raw = pending.pop()
    cleaned.append(raw.strip('\n') if raw[-1] == '\n' else raw)

# Popping from the back again restores file order. A name that appears
# twice keeps the scores of its last occurrence.
scores_by_name = {}
while cleaned:
    fields = cleaned.pop().split(',')
    scores_by_name[fields[0]] = fields[1:]

# Sorted descending and consumed from the end, so names print in ascending
# order.
names = sorted(scores_by_name, reverse=True)
while names:
    name = names.pop()
    remaining = scores_by_name[name]
    values = []
    while remaining:
        values.append(int(remaining.pop()))
    print(str(name), str(sum(values) / len(values)))
Output:
edward 2.5
sara 2.5
I am currently taking an introduction class to python scripting and am in need of help. The below variables are given
The file name -> P = test.txt
firstname -> F = 'Ruthy'
lastname -> L = 'Anderson'
new birthday-> B = 00000000
The objective is to search the file for the given first and last name and then replace the current birthday that is there with the 'B' variable. We are told that each record has a fixed length of 40.
Below is what the test.txt file contains
Adam Smith 11111985Theodore Anderson 03201990Monty Biscuit-Barrel 10181980Adam Smithers 11111900Ruthy Anderson 06062010
This is the code that I have thus far.
# Replace the birthday (the last 8 characters) of the fixed-length record
# whose name field is exactly F followed by L. P, F, L and B are supplied
# by the surrounding exercise.
recordLength = 40
birthdayLength = 8

birthday = str(B)
if len(birthday) != birthdayLength:
    # A value of any other length would shift every following record.
    raise ValueError("B must be exactly 8 characters long")

# Read everything first. The file is reopened for writing only once the new
# content is ready, so a failure cannot leave it truncated.
with open(P, 'r') as file1:
    data = file1.read()

# Slice the data into complete records.
records = []
start = 0
while (len(data) - start) >= recordLength:
    records.append(data[start:start + recordLength])
    start += recordLength
tail = data[start:]  # incomplete trailing data (if any) is kept as it is

for i, record in enumerate(records):
    # Compare the whole name field of THIS record, so "Adam Smith" does not
    # match "Adam Smithers" and names from other records are ignored.
    if record[:-birthdayLength].split() == [F, L]:
        records[i] = record[:-birthdayLength] + birthday

with open(P, 'w') as file2:
    file2.write(''.join(records) + tail)
IIUC my approach would be the following:
# P = 'test.txt'
# F = 'Ruthy'
# L = 'Anderson'
# B = '00000000'
RECORD_LENGTH = 40
BIRTHDAY_LENGTH = 8

# Overwrite, in place, the birthday of the first record whose name field is
# exactly F followed by L.
# The file is opened in binary mode because byte offsets are exact there; in
# text mode tell() returns an opaque value that must not be used for
# arithmetic. This assumes one byte per character (e.g. ASCII data), so a
# 40-character record is 40 bytes - confirm for the real file.
with open(P, 'r+b') as f:
    wanted = [F.encode(), L.encode()]
    while True:
        rec = f.read(RECORD_LENGTH)
        if len(rec) < RECORD_LENGTH:
            # End of file, or an incomplete trailing record.
            break
        # Compare the whole name field, so "Adam Smith" does not match the
        # record of "Adam Smithers" the way a prefix test would.
        if rec[:-BIRTHDAY_LENGTH].split() == wanted:
            # Step back over the birthday just read (whence=1 means
            # relative to the current position) and overwrite it.
            f.seek(-BIRTHDAY_LENGTH, 1)
            f.write(B.encode())
            break
I have a text file containing these lines for example
[:];#;;]wqwww actualnumber 1234 ;;:###
aaaa ''3# allnumber 9876
///qqq |||)))
]][]: best 56
I want to get the value 1234,9876,56. Like this(desired output)
1234
9876
56
I tried with the following script but it did not print out anything
# Print the token that follows each keyword, one per line
# (for the sample data: 1234, 9876 and 56).
keywords = ("actualnumber", "allnumber", "best")
with open("test.txt", "r") as f:
    for line in f:
        tokens = line.split()
        # Pair every token with its successor. A keyword that happens to be
        # the last token on a line has no value and is skipped.
        for token, following in zip(tokens, tokens[1:]):
            if token in keywords:
                print(following)
Did I miss something?
You can do it with the re module:
import re
with open('f.txt') as f:
    data = f.read()

# Each keyword is followed by whitespace and then the digits to capture;
# all three patterns accept one or more whitespace characters.
act = re.findall(r'actualnumber\s+(\d+)', data)
best = re.findall(r'best\s+(\d+)', data)
allnumber = re.findall(r'allnumber\s+(\d+)', data)

# Show the first match of each keyword, or None when there is none.
print("actualnumber : ", act[0] if act else None)
print("allnumber : ", allnumber[0] if allnumber else None)
print("best : ", best[0] if best else None)
output
actualnumber : 1234
allnumber : 9876
best : 56
The simple way is by using isdigit()
def _is_int(token):
    """Return True when int(token) would succeed for this token."""
    # Allow a single leading minus sign. isdecimal() (unlike isdigit())
    # accepts only characters that int() can actually parse.
    digits = token[1:] if token.startswith("-") else token
    return digits.isdecimal()


# Collect every whitespace-separated token of test.txt that is an integer.
with open('test.txt') as f:
    data = f.read()
numbers = [int(s) for s in data.split() if _is_int(s)]
print(numbers)
Output :
[1234, 9876, 56]
# Collect the token that follows each keyword of match_value in ./abx.txt.
word = ["actualnumber", "potential", "time"]
match_value = ["actualnumber", "allnumber", "best"]

with open("./abx.txt") as file:
    temp_list = [line.strip() for line in file]

# Lines containing any entry of `word`; used only as a gate below.
list_val = [item for item in temp_list if any(x in item for x in word)]

final_list = []
if list_val:
    for val in temp_list:
        val_list = val.split(" ")
        # Pair each token with its successor, so a keyword at the very end
        # of a line cannot index past the end of the list.
        for current, following in zip(val_list, val_list[1:]):
            if current in match_value:
                final_list.append(following)
print(final_list)
And at the end you can iterate list and find out your expected values.
You can extract numbers like this:
# Print every whitespace-separated token of the sample line that consists
# solely of numeric characters.
k = '[:];#;;]wqwww actualnumber 1234 ;;:###'
for numeric_token in filter(str.isnumeric, k.split()):
    print(numeric_token)
You would use your stripped lines instead: for each line in stripped_lines, split it and check whether any item of the split line is numeric. (In Python 2, isnumeric() is available only on unicode objects; in Python 3 every str has it.)
# Build a {keyword: following token} dictionary from test.txt.
with open("test.txt", "r") as f:
    stripped_lines = [line.strip() for line in f]

words = ['actualnumber', 'allnumber', 'best']
found = {}
for line in stripped_lines:
    current_line = line.split()
    # Pair each token with its successor, so a keyword that ends a line
    # (and so has no value) cannot raise IndexError.
    for item, following in zip(current_line, current_line[1:]):
        if item in words:
            found[item] = following
Now that you have them in a dictionary, you can access them as: found['actualnumber']. And do further processing, such as storing them in a database.
I have 2 text files that look like the ones below:
input.txt
shoes
memory card
memory card
input1.txt
shoes
shoes
I want to store the data in a dictionary where the query is the key and the counts are the value. input.txt and input1.txt are my two inputs. The format is word : [count of word in input.txt, count of word in input1.txt],
like below:-
op:-
shoes:[1,2]
memory card:[2,0]
Here is my approach with normal loops:
# Count how often each line occurs in each of the two inputs.
# Result format: {line: [count in first input, count in second input]}.
i = 'shoes\nmemory card\nmemory card'
i2 = 'shoes\nshoes'

# Named `counts` rather than `dict`, so the builtin type is not shadowed.
counts = {}
for position, text in enumerate((i, i2)):
    for ele in text.split('\n'):
        # setdefault() creates the [0, 0] pair the first time a line is
        # seen; `position` selects which input's counter to increment.
        counts.setdefault(ele, [0, 0])[position] += 1
print(counts)
You could do as follows:
from collections import Counter
# Load both files as lists of whitespace-stripped lines.
with open('input.txt', 'r') as first_file:
    lines1 = [entry.strip() for entry in first_file]
with open('input1.txt', 'r') as second_file:
    lines2 = [entry.strip() for entry in second_file]

# Tally how many times each line occurs in each file.
c1 = Counter(lines1)
c2 = Counter(lines2)

# For every distinct line seen in either file, pair up its two counts.
# Indexing a Counter with a missing key yields 0.
out_dict = {key: [c1[key], c2[key]] for key in set(c1) | set(c2)}
print(out_dict)
The result is:
{'shoes': [1, 2], 'memory card': [2, 0]}
Actually, I am getting the data from some JSON files.
import os
import json
# Count search queries found in JSON-lines log files under path1.
# Result format: {query: [count with auto prefix, count without it]}.
auto_prefix = u'\u200B'
path1 = "/home/mine/new-log/1-search-logs-01-20/"

# Named `query_counts` rather than `dict`, so the builtin is not shadowed.
query_counts = {}
for path, dirs, files in os.walk(path1):
    for data in files:
        # Join with the directory os.walk is currently visiting, so files in
        # subdirectories are opened from the right place.
        with open(os.path.join(path, data)) as text:
            for line in text:
                file2 = json.loads(line)
                # Only the nested lookup is guarded: records without a
                # query are skipped, while other errors stay visible.
                try:
                    query = file2["appData"]["_request_params"]["params"]["q"][0]
                except KeyError:
                    continue
                # Slot 0 counts queries carrying the auto prefix, slot 1
                # counts those without it.
                slot = 0 if auto_prefix in query else 1
                # Keys are kept as text: on Python 3, encode() would turn
                # them into bytes objects.
                query = query.replace(auto_prefix, "").lower()
                query_counts.setdefault(query, [0, 0])[slot] += 1
print(query_counts)