Reading file from the internet and split into 2 - python

I am brand new to Python and try the following: I am reading a file from the internet and want to split it at a certain amount of lines.
1. File = line 1 to x
2. File = line x+1 to eof
I use httplib2 to read the file from the internet and split then this file into 2. Tried it with the "with" but it seems that I cannot use f.readline() etc when I am reading a file from the internet and use it with "with". If I open a local file it works fine.
Do I miss something here?
Thank you very much for your help in advance.
with data_file as f: #data_file is the file read from the internet
Here is my function:
def create_data_files(data_file):
# read the file from the internet and split it into two files
# Loading file give info if the file was loaded from cache or internet
try:
print("Reading file from the Internet or Cache")
h = httplib2.Http(".cache")
data_header, data_file = h.request(DATA_URL) # , headers={'cache-control':'no-cache'}) # to force download form internet
data_file = data_file.decode()
except httplib2.HttpLib2Error as e:
print(e)
# Give the info if the file was read from the internet or from the cache
print("DataHeader", data_header.fromcache)
if data_header.fromcache == True:
print("File was read from cache")
else:
print("File was read from the internet")
# Counting the amount of total characters in the file - only for testing
# print("Total amount of characters in the original file", len(data_file)) # just for testing
# Counting the lines in the file
print("Counting lines in the file")
single_line = data_file.split("\n")
for value in single_line:
value =value.strip()
#print(value) # juist for testing - prints all the lines separeted
print("Total amount of lines in the original file", len(single_line))
# Asking the user how many lines in percentage of the total amount should be training data
while True:
#split_factor = int(input("What percentage should be use as training data? Enter a number between 0 and 100: "))
split_factor = 70
print("Split Factor set to 70% for test purposes")
if 0 <= split_factor <= 100:
break
print('try again')
split_number = int(len(single_line)*split_factor/100)
print("Number of Training set data", split_number) # just for testing
# Splitting the file into 2
training_data_file = 0
test_data_file = 0
return training_data_file, test_data_file

from collections import deque
import httplib2
def create_data_files(data_url, split_factor=0.7):
h = httplib2.Http()
resp_headers, content = h.request(data_url, "GET")
# for python3
content = content.decode()
lines = deque(content.split('\n'))
stop = len(lines) * split_factor
training, test = [], []
i = 0
while lines:
l = lines.popleft()
if i <= stop:
training.append(l)
else:
test.append(l)
i +=1
training_str, test_str = '\n'.join(training), '\n'.join(test)
return training_str, test_str
This should do the trick (not tested and simplified).
data_header, data_file = h.request(DATA_URL)
data_file is not a file like object but a string

Related

how to read and verify a text file exists or not in python in while loop?

I'm trying to verify if a file exists or not in the current directory. At the same time, read if the file exists. The below is my code
import os.path
def readdata():
isFileExist = False
while isFileExist == False:
userIN = str(input("Please Enter a file name, followed by .txt: "))
isExist = os.path.exists(userIN)
if isFileExistExist == False:
print(f"Checking... {userIN} DOES NOT exist in the current directory. Please retry.")
else:
print(f"Checking... {userIN} exist in the current directory.")
print(f"\n The file < {userIN} > includes the following data:")
IN = open(userIN,"r")
std1N = IN.readline()
std1M = IN.readline()
std2N = IN.readline()
std2M = IN.readline()
std3N = IN.readline()
std3M = IN.readline()
IN.close()
print(f" Student Name: {std1N.strip()}")
print(f" Performance: {std1M.strip()} out of 100")
print(f" Student Name: {std2N.strip()}")
print(f" Performance: {std2M.strip()} out of 100")
print(f" Student Name: {std3N.strip()}")
print(f" Performance: {std3M.strip()} out of 100")
print(f" The average exam score of the 3.0 students in the file < {userIN} > is {(eval(s1M.strip())+eval(s2M.strip())+eval(s3M.strip()))/3:.2f}.")
def main():
readdata()
if __name__ == '__main__':
main()
How to use main to transfer file name to the function readdata()? And then verify if it exists in the current file directory and read the file. As well as, getting the average 3 student's average in that file. I want to transfer the filename from main to readdata() function. How can I achieve that? Thank you for your time and consideration
There is a couple of methods to check it.
try except
try:
file = open(file_name)
except FileNotFoundError:
# do something when file not exist
pathlib.Path object has exists method.
os.path.isfile method
To read data from file use:
with open(file_name) as f:
for line in f:
# do something with current line
# context manager close file automaticly
or
with open(file_name) as f:
lines = f.readlines()
for line in lines:
# do something with current line
Second way is worse then first because:
first one uses generator
second one uses creates and use list (slower and use more memory)
But in your case you need to read 2 lines per iteration so you can use readline method 2 times per iteration and break it when readline return None.
e.g
with open(file_name) as f:
current_name = f.readline()
current_performance = f.readline()
while current_name is not None:
# do something
current_name = f.readline()
current_performance = f.readline()
You can also extend condition with a and current_performance is not None to be sure that u have both values.

How to print strings from a text file,separated by new line in Python

I'm new in Python and i'm challenging myself by making an online library management with prompt for the 1st phase.I'm stacked in search function.I have found how to print a user's input,but i can't find how to print and the following data.I want to search a book by name.If book's name is in the text,i want to print the details of the book,like author,isbn etc.
Here is the following code i have made:
def search():
search_book = input('Search a book: ')
with open('library.txt', mode='r', encoding='utf-8') as f:
index = 0
for line in f:
index += 1
if search_book in line:
print(f'{search_book} is in line {index}')
for details in range(index,index+5):
print(line[details])
And this is the text file's data:
FIRST
ME
9781234
2000
Science
SECOND
YOU
9791234
1980
Literature
It is separated by new line.As example a user input the name FIRST and the result will be:
FIRST
ME
9781234
2000
Science
There are two file options we can consider,
Csv file - Instead of individual readline, you could use one line for one book entry.
# ---------test.csv -------------
# BookName, ItemCode, Price
# Book1, 00012, 14.55
# Book2, 00232, 55.12
# -----End Csv-------------------
import csv
def read_csv(filename:str):
file_contents = None
# reading csv file
with open(filename, 'r') as csvfile:
file_contents = csv.reader(csvfile)
return file_contents
def search(file_contents, book_name:str):
if not file_contents:
return None
for line in file_contents:
if book_name in line:
return line
if __name__ == '__main__':
file_contents = read_csv('test.csv')
line = search(file_contents, 'ME')
print(line if line else 'No Hit Found')
Json - This is much better option than csv file
import json
def read_json(filename:str) -> dict:
with open(filename) as json_file:
all_books = json.load(json_file)
return all_books
def search(all_books:dict, book_name:str):
for book_id, book_details in all_books.items():
if book_details['Name'] == book_name:
return book_details
return None
if __name__ == '__main__':
all_books = read_json('books.json')
book = search(all_books, 'YOU')
print(book if book else 'Not hit found')
If your file contents can't change, then I will go with #tripleee suggestion above. Good luck.
You are reading a line at a time, and looping over the first line's contents. At this point in the program, there are not yet any additional lines. But a fix is relatively easy:
def search():
search_book = input('Search a book: ')
with open('library.txt', mode='r', encoding='utf-8') as f:
index = 0
print_this_many = 0
for line in f:
index += 1
if search_book in line:
print(f'{search_book} is in line {index}')
print_this_many = 5
if print_this_many:
print(line, end='')
print_this_many -= 1
We don't have the next lines in memory yet, but we can remember how many of them to print as we go ahead and read more of them. The print_this_many variable is used for this: When we see the title we want, we set it to 5 (to specify that this and the next four lines should be printed). Now, each time we read a new line, we check if this variable is positive; if it is, we print the line and decrement the variable. When it reaches zero, we will no longer print the following lines. This allows us to "remember" across iterations of the for loop which reads each new line whether we are in the middle of printing something.
A much better solution is to read the database into memory once, and organize the lines into a dictionary, for example.
def read_lib(filename):
library = dict()
with open(filename) as lib:
title = None
info = []
for line in lib:
line = line.rstrip('\n')
if title is None:
title = line
elif line == '':
if title and info:
library[title] = info
title = None
else:
info.append(line)
def search(title, library):
if title in library:
return library[title]
else:
return None
def main():
my_library = read_lib('library.txt')
while True:
sought = input('Search a book: ')
found = search(sought, my_library)
if found:
print('\n'.join(found))
else:
print('Sorry, no such title in library')

I am trying to create a program that edits text by letting you select from a few things and changes them to one of a few options

import os, re
config_file = "jsm_gyro_config.txt"
#fptr = open(config, "w")
#text = "demo text"
#fptr.write(text)
#fptr.close()
file = open(config_file, 'r')
file-read = file.read()
for line in file-read:
if re.search(userinput, file-read):
x = re.search(userinput, file-read)
# iteminputted is what the user wants to replace
iteminputted = "ref"
startpostion = x.span[1] + 3
endpostion = startposition + len(iteminputted)
# Find out how to write to a specific location in a file that will finish this off
else:
print("Item not found")
This is what i've tried and here is my thought process as always any help is appreatated and please make it understandable for an idiot :(
To begin with, you should not use - in your variable declarations as it is actually an operator and will always be treated as such. It will attempt to subtract.
Here is the same code with that fixed and also with the input
import os, re
config_file = "jsm_gyro_config.txt"
#fptr = open(config, "w")
#text = "demo text"
#fptr.write(text)
#fptr.close()
file = open(config_file, 'r')
file_read = file.read()
file.close() # You should always close your files.
for line in file_read:
if re.search(userinput, file_read):
x = re.search(userinput, file_read)
# iteminputted is what the user wants to replace
iteminputted = input("Input what you would like to replace > ")
startpostion = x.span[1] + 3
endpostion = startposition + len(iteminputted)
# Find out how to write to a specific location in a file that will finish this off
else:
print("Item not found")
However your question is very unclear, I did the best I could.

Counting reads and bases from a list of fastq files

I trimmed my Illumina short reads, forward and reverse, by using Trimmomatic. The Trimmomatic's outputs were: paired_1 - unpaired_1, and paired_2 - unpaired_2.fastq.gz files. I want to know how big was the impact of trimming by counting the number of reads and bases of each file in my directory. I had made a script to count the number of bases and reads for each file in my directory; however, I have problems in if __name__=='__main__'. When I do the for loop I don't know the order of the files that will be run, how can I make it to call the files by the order I see from the screen? Additionally, I also need help with correcting the script as I don't get any stdout.
Thank you in advance for your help.
#!/usr/bin/env python
from sys import argv
import os
def get_num_bases(file_content):
total = []
for linenumber, line in enumerate(file_content):
mod=linenumber%4
if mod==0:
ID = line.strip()[1:]
#print(ID)
if mod==1:
seq = line.strip()
counting = 0
counting += seq.count("T")+ seq.count("A") + seq.count("C") + seq.count("G")
total.append(counting)
allbases = sum(total)
print("Number of bases are: " , allbases)
def get_num_reads(file_content):
total_1 = []
for line in file_content:
num_reads = 0
num_reads += content.count(line)
total_1.append(num_reads)
print("Number of reads are: ", sum(total_1)/int(4))
if __name__=='__main__':
path = os.getcwd()
dir_files = os.listdir(path)
list_files = []
for file in dir_files:
if file.endswith("fastq.gz"):
if file not in list_files:
file_content = open(file, "r").readlines()
list_files.append(file)
print("This is the filename: ", file, get_num_bases(file_content), get_num_reads(file_content))

python replace value string with number

im building a system to sort some .log files in .txt format so that I later can send it to excel. There is 70+ files and in every file im scanning for a keyword, I get 1000+ strings that I want to save in a .txt. I can get every string that I want and see from which .log file each log has ben taken, but now I want to rename the file that the .log came from with a corresponding number 1,2,3,4,5...(one number for every file instead for its file name). code:
import glob
def LogFile(filename, tester):
message = []
data = []
print(filename)
with open(filename) as filesearch: # open search file
filesearch = filesearch.readlines() # read file
i = 1
d = {}
for filename in filename:
if not d.get(filename, False):
d[filename] = i
i += 1
for line in filesearch:
if tester in line: # extract ""
start = '-> '
end = ':\ '
number = line[line.find(start)+3: line.find(end)] #[ord('-> '):ord(' :\ ')]
data.append(number) # store all found wors in array
text = line[line.find(end)+3:]
message.append(text)
with open('Msg.txt', 'a') as handler: # create .txt file
for i in range(len(data)):
handler.write(f"{i}|{data[i]}|{message[i]}")
# open with 'w' to "reset" the file.
with open('Msg.txt', 'w') as file_handler:
pass
# ---------------------------------------------------------------------------------
for filename in glob.glob(r'C:\Users\FKAISER\Desktop\AccessSPA\SPA\*.log'):
LogFile(filename, 'Sending Request: Tester')
I have tried using this function
i = 1
d = {}
for filename in filename:
if not d.get(filename, False):
d[filename] = i
i += 1
but then it looks like this
i want each file to have the same number as in the picture, 1 indicates for all the 26 logs and 2 indicates for the 10 file in that folder... etc

Categories