Moving contents from one file to a new one in Python

I want to print to the even file all even numbers with spaces between them, e.g. 12 6 20 10 rather than 1262010, with no spaces in front or back. How can I do this?
def write_positive_even_to_file(filename):
    with open(filename, 'r') as orginal, open('xxx.txt', 'a') as even:
        red = orginal.read().split()
        for number in red:
            if number % 2 == 0:
                even.write(number + " ")
Input file:
15 12 6
7 20 9 10
13 17
3

You need to split each input line into tokens (assumed to represent integers), convert them to int, and then determine whether each value is even.
Something like this:
def write_positives(infile, outfile, mode='a'):
    with open(infile) as fin, open(outfile, mode) as fout:
        if (evens := [x for x in map(int, fin.read().split()) if x % 2 == 0]):
            print(*evens, file=fout)
By printing the unpacked list you will, by default, get space separation between the values.
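For instance, calling the function on the question's input file (the file names here are just placeholders for this sketch) would append one space-separated line to the output file:

write_positives("input.txt", "xxx.txt")
# With the question's input file, the line written to xxx.txt would be:
# 12 6 20 10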

Related

Separating Values from a file in Python

So let's say I have this txt file formatted (value)(space)(value) and there's a second set of numbers separated with a (tab). An example file is given here:
Header
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
Now I'm using this code to print all the values shown in the txt file:
NEWLINE = "\n"

def readBoardFromFile():
    inputFileOK = False
    aBoard = []
    while (inputFileOK == False):
        try:
            inputFileName = input("Enter the name of your file: ")
            inputFile = open(inputFileName, "r")
            print("Opening File " + inputFileName + "for reading")
            currentRow = 0
            next(inputFile)
            for line in inputFile:
                aBoard.append([])
                for ch in line:
                    if (ch != NEWLINE):
                        aBoard[currentRow].append(ch)
                currentRow = currentRow + 1
            inputFileOK = True
            print("Completed reading of file " + inputFileName)
        except IOError:
            print("Error: File couldn't be opened")
    numRows = len(aBoard)
    numColumns = len(aBoard[0])
    return (aBoard, numRows, numColumns)

def display(aBoard, numRows, numColumns):
    currentRow = 0
    currentColumn = 0
    print("DISPLAY")
    while (currentRow < numRows):
        currentColumn = 0
        while (currentColumn < numColumns):
            print("%s" % (aBoard[currentRow][currentColumn]), end="")
            currentColumn = currentColumn + 1
        currentRow = currentRow + 1
        print()
    for currentColumn in range(0, numColumns, 1):
        print("*", end="")
    print(NEWLINE)

def start():
    aBoard, numRows, numColumns = readBoardFromFile()
    display(aBoard, numRows, numColumns)

start()
Normally when I run this code this is the output:
DISPLAY
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
*******************
How do I make it so that the output is:
DISPLAY
5 5 6 7 8
7 6 3 4 1
8 7 4 1 3
Only displaying the numbers in the left half?
Perhaps you can try using the csv module and open the file with a tab delimiter.
Then, assuming you go with the list approach, you could print only the first element from each row.
Something like:
import csv

with open("a.txt") as my_file:
    reader = csv.reader(my_file, delimiter='\t')
    next(reader)  # to skip the header if it exists
    for line in reader:
        print(line[0])
From what I can see, you aren't taking the tab character into account in your code, which is probably why you have the additional characters in your output.
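A quick way to see that, using one of the sample rows as a literal string:

line = "5 5 6 7 8\t7 8 9 0 1\n"
left_half = line.split("\t")[0]   # everything before the tab
print(left_half.split())          # ['5', '5', '6', '7', '8']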
Here's the approach I would take, utilising Python's power when it comes to processing strings.
I would encourage you to use this type of approach when writing Python, as it will make your life much easier.
NEWLINE = "\n"

def read_board_from_file():
    input_file_OK = False
    a_board = []
    while not input_file_OK:
        try:
            input_file_name = input("Enter the name of your file: ")
            with open(input_file_name, "r") as input_file:
                # A file-like object (as returned by open())
                # can simply be iterated.
                for line in input_file:
                    # Skip the header line if present
                    # (not sure how you would want to handle the header).
                    if "header" in line.lower():
                        continue
                    # Strip NEWLINE from each line,
                    # then split the line at the tab character.
                    # See comment above.
                    parts = line.strip(NEWLINE).split("\t")
                    # parts is a list; we are only interested
                    # in the first bit.
                    first_part = parts[0]
                    # Split the left part at all whitespace.
                    # I'm assuming that this is what you want.
                    # A more complex treatment might make sense here,
                    # depending on your use-case.
                    entries = first_part.split()
                    # entries is a list, so we just need to append it
                    # to the board.
                    a_board.append(entries)
            input_file_OK = True
            print(f"Completed reading of file {input_file_name}")
        except IOError:
            print(f"Error: File {input_file_name} couldn't be opened.")
    return a_board

def display_board(a_board):
    print("DISPLAY")
    longest_row = 0
    # a_board is a list of lists;
    # no need to keep track of the number of rows and columns,
    # we can just iterate over it.
    for row in a_board:
        # row is a list of entries; we can use str.join() to add a space
        # between the parts and format the row nicely.
        row_str = " ".join(row)
        # At the same time we can keep track of the longest row
        # for printing the footer later.
        len_row_str = len(row_str)
        if len_row_str > longest_row:
            longest_row = len_row_str
        print(row_str)
    # The footer is simply the asterisk character
    # printed as many times as the longest row.
    footer = "*" * longest_row
    print(footer, end="")
    print(NEWLINE)

def start():
    a_board = read_board_from_file()
    display_board(a_board)

start()
I'd do this with a bit more separation between inputs, outputs, and data processing. Your input is the name of the file, and a secondary input is the actual file contents. The data processing step is taking the file contents, and returning some internal representation of a collection of boards. The output is displaying the first board.
from typing import Iterable, List

def parse(lines: Iterable[str], board_sep: str = "\t") -> List[List[List[str]]]:
    boards = []
    for i, line in enumerate(lines):
        # list of the same line from each board
        board_lines = line.split(board_sep)
        if i == 0:
            # this only happens once at the start
            # each board can be a list of lists, so boards is a list of "lists of lists"
            # we're going to append lines to each board, so we need some initial setup
            # of boards
            boards = [[] for _ in range(len(board_lines))]
        for board_idx, board_line in enumerate(board_lines):
            # then just add each line of each board to the corresponding section
            boards[board_idx].append(board_line.split())
    return boards

def show_board(board: List[List[str]]) -> None:
    for row in board:
        print(" ".join(row))
Now we can put that all together. We need to:
Get the filename
Open the file
Filter out the "Header" and any blank lines
Pass the rest of the lines to the parse() function
Get the first board
Print it with the context
from typing import Tuple

def get_board_dimensions(board: List[List[str]]) -> Tuple[int, int]:
    """Returns a tuple of (rows, cols)."""
    return len(board), len(board[0])

def get_filtered_file(filename: str) -> Iterable[str]:
    with open(filename) as f:
        for line in f:
            # strip() handles the trailing newline and blank lines
            if not line.strip() or line.strip().lower() == "header":
                continue
            yield line

def main():
    filename = input("Enter the name of your file: ")
    filtered_lines = get_filtered_file(filename)
    boards = parse(filtered_lines)
    # now we can show the first one
    b = boards[0]
    _, cols = get_board_dimensions(b)
    print("DISPLAY")
    show_board(b)
    print("*" * (2 * cols - 1))  # columns plus gaps

main()

Possible to do this more efficiently (turn compact file to sparse)

I have to read in, line by line, a file that contains the indices where a vector has 1's.
So, for example:
1 3 9 10
means:
0,1,0,1,0,0,0,0,0,1,1
My goal is to write a program that will take each line and print out the full vector with the 0's.
I am able to do this with my current program for a few lines:
#create a sparse vector
list_line_sparse = [0] * int(num_features)
#accumulate the whole output in one string
out_file = ""
#loop over all the lines
for item in lines:
    #split the line on spaces
    zz = item.split(' ')
    #get all ints on a line
    d = [int(x.strip()) for x in zz]
    #loop over all ints and change index to 1 in sparse vector
    for i in d:
        list_line_sparse[i] = 1
    out_file += (', '.join(str(item) for item in list_line_sparse))
    #change back to 0's
    for i in d:
        list_line_sparse[i] = 0
    out_file += '\n'

f = open('outfile', 'w')
f.write(out_file)
f.close()
The problem is that for a file with a lot of features and lines, my program is very, very inefficient - it basically never finishes. Is there anything sticking out that I should change to make it more efficient? (i.e. the two for loops)
It would probably be more efficient to write each line of data to your output file as it is generated, rather than building up a huge string in memory.
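A minimal sketch of that idea, reusing the names from the question (num_features and lines are assumed to already exist):

# Write each formatted row as soon as it is built,
# instead of accumulating one huge output string.
with open('outfile', 'w') as f:
    for item in lines:
        d = [int(x) for x in item.split()]
        row = [0] * int(num_features)
        for i in d:
            row[i] = 1
        f.write(', '.join(str(v) for v in row) + '\n')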
numpy is a popular Python module that's good for doing bulk operations on numbers. If you start with:
import numpy as np
list_line_sparse = np.zeros(num_features, dtype=np.uint8)
Then, given d as the list of numbers on the current line, you can simply do:
list_line_sparse[d] = 1
to set ALL of those indexes in the array at the same time, no loop required. (At the Python level at least, obviously there's still a loop involved, but it's down in the C implementation of numpy).
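Putting the two pieces together, a rough sketch (the file names and num_features value are just placeholders) might look like this:

import numpy as np

num_features = 20  # assumed to be known in advance

with open('input.txt') as f_in, open('output.txt', 'w') as f_out:
    for line in f_in:
        d = [int(x) for x in line.split()]
        row = np.zeros(num_features, dtype=np.uint8)
        row[d] = 1  # fancy indexing sets every listed position at once
        f_out.write(','.join(str(int(v)) for v in row) + '\n')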
It is slowing down because you are doing string concatenation. It is better to work with lists.
Also you could use csv to read your space-separated lines in, and then write each row with the commas added automatically:
import csv

num_features = 20

with open('input.txt', 'r', newline='') as f_input, open('output.txt', 'w', newline='') as f_output:
    csv_input = csv.reader(f_input, delimiter=' ')
    csv_output = csv.writer(f_output)

    for row in csv_input:
        list_line_sparse = [0] * int(num_features)
        for v in map(int, row):
            list_line_sparse[v] = 1
        csv_output.writerow(list_line_sparse)
So if input.txt contained the following:
1 3 9 10
1 3 9 11
2 7 3 5
Giving you an output.txt containing:
0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Too many loops: first the item.split(), then the for x in zz, then for i in d, then for item in list_line_sparse, and then for i in d again. String concatenation could be your most expensive part: the .join and the out_file +=. And all of this happens for every line.
You could try parsing and writing character by character. Something like this:
#features per line
count = int(num_features)

f = open('outfile.txt', 'w')

#loop over all lines
for item in lines:
    #reset the feature
    i = 0
    #the characters buffer
    index = ""
    #parse character by character
    for character in item:
        #if a space or end of line is found,
        #and the characters buffer (index) is not empty
        if character in (" ", "\r", "\n"):
            if index:
                #parse the characters buffer
                index = int(index)
                #if it is not the first feature
                if i > 0:
                    #add the separator
                    f.write(", ")
                #add 0's until index
                while i < index:
                    f.write("0, ")
                    i += 1
                #and write 1
                f.write("1")
                i += 1
                #reset the characters buffer
                index = ""
        #if it is not a space or end of line
        else:
            #add the character to the buffer
            index += character
    #if the last line didn't end with a carriage return,
    #index could be waiting to be parsed
    if index:
        index = int(index)
        if i > 0:
            f.write(", ")
        while i < index:
            f.write("0, ")
            i += 1
        f.write("1")
        i += 1
        index = ""
    #fill with 0's
    while i < count:
        if i == 0:
            f.write("0")
        else:
            f.write(", 0")
        i += 1
    f.write("\n")

f.close()
Let's rework your code into a simpler package that takes better advantage of Python's features:
import sys

NUM_FEATURES = 12

with open(sys.argv[1]) as source, open(sys.argv[2], 'w') as sink:
    for line in source:
        list_line_sparse = [0] * NUM_FEATURES
        indices = map(int, line.rstrip().split())
        for index in indices:
            list_line_sparse[index] = 1
        print(*list_line_sparse, file=sink, sep=',')
I revisited this problem with your "more efficiently" in mind. Although the above is more memory efficient, it is a hair slower time-wise. I reconsidered your original and came up with a solution that is less memory efficient but about 2x faster than your code:
import sys

NUM_FEATURES = 12

data = ''

with open(sys.argv[1]) as source:
    for line in source:
        list_line_sparse = ["0"] * NUM_FEATURES
        indices = map(int, line.rstrip().split())
        for index in indices:
            list_line_sparse[index] = "1"
        data += ",".join(list_line_sparse) + '\n'

with open(sys.argv[2], 'w') as sink:
    sink.write(data)
Like your original solution, it stores all the data in memory and writes it out at the end, which is both a disadvantage (memory-wise) and an advantage (time-wise).
input.txt
1 3 9 10
1 3 9 11
2 7 3 5
USAGE
% python3 test.py input.txt output.txt
output.txt
0,1,0,1,0,0,0,0,0,1,1,0
0,1,0,1,0,0,0,0,0,1,0,1
0,0,1,1,0,1,0,1,0,0,0,0

How can I improve the performance of my program that counts the occurrence of numbers in a list?

I am trying to solve the following problem:
Given: A positive integer k ≤ 20, a positive integer n ≤ 10^4, and k arrays of size n containing positive integers not exceeding 10^5.
Return: For each array, output an element of this array occurring strictly more than n/2 times if such element exists, and "-1" otherwise.
My attempt at it is the following:
myFile = open('rosalind_maj.txt', 'r')
outputf = open('output.txt', 'w')

TOTAL_ARRAYS, SIZE = myFile.readline().split()
TOTAL_ARRAYS = int(TOTAL_ARRAYS)
SIZE = int(SIZE)

lines = myFile.readlines()

for i in range(0, TOTAL_ARRAYS):
    array = map(int, (lines[i].strip().split()))
    occurance = []
    output = []
    considered = []
    for element in array:
        if element not in considered:
            if (array.count(element)) > SIZE/2:
                output.append(element)
            considered.append(element)
    if len(output) == 0:
        print (-1)
        outputf.write( str(-1) + " " )
    else:
        for item in output:
            print item
            outputf.write(str(item) + " " )
A sample of the input text file looks like the following:
4 8
5 5 5 5 5 5 5 5
8 7 7 7 1 7 3 7
7 1 6 5 10 100 1000 1
5 1 6 7 1 1 10 1
and the correct answer to the sample input is:
5 7 -1 -1
My code works but is too slow. It takes 20 secs for an input with 19 arrays, each of size 8000 elements.
You can use the Majority Vote Algorithm (Boyer-Moore). It runs in O(n) time and takes O(1) extra space!
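A rough sketch of that approach: one pass finds a candidate, and a second pass verifies it really occurs strictly more than n/2 times.

def majority_element(values):
    # Pass 1: find a candidate with the Boyer-Moore vote.
    candidate, count = None, 0
    for v in values:
        if count == 0:
            candidate = v
        count += 1 if v == candidate else -1
    # Pass 2: confirm the candidate actually exceeds n/2 occurrences.
    if candidate is not None and values.count(candidate) > len(values) // 2:
        return candidate
    return -1

print(majority_element([5, 5, 5, 5, 5, 5, 5, 5]))        # 5
print(majority_element([7, 1, 6, 5, 10, 100, 1000, 1]))  # -1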
I tried the following:
#!/usr/bin/env python
from itertools import imap
from collections import defaultdict

with open("rosalind_maj.txt", "r") as in_file:
    with open("output.txt", "w") as out_file:
        nb_arrays, ar_size = map(int, in_file.readline().split())
        majority = ar_size / 2.0
        # i is probably useless if the file has the correct number of lines
        for i, line in enumerate(in_file, start=1):
            counts = defaultdict(int)
            found = False
            if i > nb_arrays:
                break
            for element in line.strip().split():
                counts[element] += 1
                if counts[element] > majority:
                    out_file.write("%s " % element)
                    found = True
                    break
            if not found:
                out_file.write("-1 ")
On a dataset of a size about the one you said you tested (19 lines with 8000 elements), and on my old 2007 laptop, this runs in about 0.15 seconds whereas your code runs in about 7 seconds (I suppressed the print from it).
The resulting output files are the same.
I'm not sure of the reason, but the slowness could be due in part to the append() calls and to testing element membership in a list in your code.
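For comparison, a hedged sketch that avoids both the repeated count() calls and the list membership tests by using collections.Counter (not the answerer's code, just an illustration):

from collections import Counter

def majority_or_minus_one(numbers, size):
    # One pass builds all counts; most_common(1) returns the top (value, occurrences) pair.
    value, occurrences = Counter(numbers).most_common(1)[0]
    return value if occurrences > size / 2 else -1

print(majority_or_minus_one([8, 7, 7, 7, 1, 7, 3, 7], 8))  # 7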

How to make Python read new lines and just lines?

I know that Python can read numbers like:
8
5
4
2
2
6
But I am not sure how to make it read it like:
8 5 4 2 2 6
Also, is there a way to make Python read both ways? For example:
8 5 4
2
6
I think reading with new lines would be:
info = open("info.txt", "r")
lines = info.readlines()
info.close()
How can I change the code so it would read downwards and to the sides like in my third example above?
I have a program like this:
info = open("1.txt", "r")
lines = info.readlines()
numbers = []
for l in lines:
num = int(l)
numbers.append(str(num**2))
info.close()
info = open("1.txt", "w")
for num in numbers:
info.write(num + "\n")
info.close()
How can I make the program read each number separately in new lines and in just lines?
Keeping them as strings:
with open("info.txt") as fobj:
numbers = fobj.read().split()
Or, converting them to integers:
with open("info.txt") as fobj:
numbers = [int(entry) for entry in fobj.read().split()]
This works with one number and several numbers per line.
This file content:
1
2
3 4 5
6
7
8 9 10
11
will result in this output for numbers:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
This approach reads the whole file at once. Unless your file is really large this is fine.
info = open("1.txt", "r")
lines = info.readlines()
numbers = []
for line in lines:
for num_str in line.split(' '):
num = int(num_str)
numbers.append(str(num**2))
info.close()
info = open("test.txt", "r")
lines = info.readlines()
numbers = []
for l in lines:
l = l.strip()
lSplit = l.split(' ')
if len(lSplit) == 1:
num = int(l)
numbers.append(str(num**2))
else:
for num in lSplit:
num2 = int(num)
numbers.append(str(num2**2))
print numbers
info.close()
A good way to do this is with a generator that iterates over the lines, and, for each line, yields each of the numbers on it. This works fine if there is only one number on the line (or none), too.
def numberfile(filename):
    with open(filename) as input:
        for line in input:
            for number in line.split():
                yield int(number)
Then you can just write, for example:
for n in numberfile("info.txt"):
    print(n)
If you don't care how many numbers per line, then you could try this to create the list of the squares of all the numbers.
I have simplified your code a bit by iterating directly over the open file using a with statement; iterating over the readlines() result would work just as well for small files, but for large ones this method has the advantage of not holding the whole file content in memory.
numbers = []
with open("1.txt", 'r') as f:
    for line in f:
        nums = line.split()
        for n in nums:
            numbers.append(str(int(n)**2))
Just another not yet posted way...
numbers = []
with open('info.txt') as f:
    for line in f:
        numbers.extend(map(int, line.split()))
file_ = """
1 2 3 4 5 6 7 8
9 10
11
12 13 14
"""

for number in file_.split():
    print number
>>
1
2
3
4
5
6
7
8
9
10
11
12
13
14

Reading and editing a font file and using a dictionary

I have to take the values from a text file which contains the co-ordinates to draw characters out in TurtleWorld. An example of the text file is the following:
<character=B, width=21, code=66>
4 21
4 0
-1 -1
4 21
13 21
16 20
17 19
18 17
18 15
17 13
16 12
13 11
-1 -1
4 11
13 11
16 10
17 9
18 7
18 4
17 2
16 1
13 0
4 0
</character>
I have to then write a function to take all of these points and then convert them into a dictionary where a key is the character and the corresponding values are the set of points which can be used to draw that character in TurtleWorld.
The code I have tried is the following:
def read_font():
    """
    Read the text from font.txt and convert the lines into instructions for how to plot specific characters
    """
    filename = raw_input("\n\nInsert a file path to read the text of that file (or press any letter to use the default font.txt): ")
    if len(filename) == 1:
        filename = 'E:\words.txt'
        words = open(filename, 'r')
    else:
        words = open(filename, 'r')
    while True:  # Restarts the function if the file path is invalid
        line = words.readline()
        line = line.strip()
        if line[0] == '#' or line[0] == ' ':  # Used to omit any unwanted lines of text
            continue
        elif line[0] == '<' and line[1] == '/':  # Conditional used for the end of each character
            font_dictionary[character] = numbers_list
        elif line[0] == '<' and line[1] != '/':
Take a look at http://oreilly.com/catalog/pythonxml/chapter/ch01.html, specifically the example titled "Example 1-1: bookhandler.py".
You can more or less copy that and tweak it to read your particular XML. Once you get the 'guts' (your coords), you can split them into a list of x/y coords really easily,
such as
a = "1 3\n23 4\n3 9\n"
coords = map(int,a.split())
then chunk it into a list of groups of 2 (see "How do you split a list into evenly sized chunks?") and store the result with letters[letter] = result, as in the sketch below.
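A small sketch of that chunking-and-storing step (letters and the "B" key are just placeholder names):

a = "4 21\n4 0\n-1 -1\n4 21\n"
flat = list(map(int, a.split()))

# Group the flat list into [x, y] pairs, two values at a time.
result = [flat[i:i + 2] for i in range(0, len(flat), 2)]
# result == [[4, 21], [4, 0], [-1, -1], [4, 21]]

letters = {}
letters["B"] = result  # store the point list under its character key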
Or you can do the chunking in a funkier way using the re module:
>>> import re
>>> a = "1 13\n4 5\n"
>>> b = re.findall("\d+ *\d+", a)
>>> c = [map(int, item.split()) for item in b]
>>> c
[[1, 13], [4, 5]]
