import pandas as pd
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('env_sequences.csv')
Namedf = df['Name']
Uniprotdf = df['ID']
for row in Uniprotdf:
    theurl = 'https://www.uniprot.org/uniprot/' + row + '.fasta'
    page = requests.get(theurl).content

for row in Namedf:
    fasta = open(row + '.txt', 'w')
    fasta.write(page)
    fasta.close()
#Sample website: https://www.uniprot.org/uniprot/P04578.fasta
I have a .csv file. I am using its 'ID' column to generate URLs, downloading the content at each URL, and saving it to a file named after the corresponding entry in the 'Name' column of the same .csv.
The code stops working in the second for loop, where I get a TypeError when passing the page variable to fasta.write(). Yet if I print(page), I can see exactly the text I want to have in each file. Is this a case of me having to convert the HTML into a string? I am unsure how to proceed from here.
For the given URL, if you print the content of the page, you'll notice that it starts with b'', which indicates the content is bytes (binary data), not text.
print(page)
b'>sp|P04578|ENV_HV1H2 Envelope glycoprotein gp160 OS=Human immunodeficiency virus type 1 group M subtype B (isolate HXB2) OX=11706 GN=env PE=1 SV=2\nMRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWKEATTTLFCASDAKA\nYDTEVHNVWATHACVPTDPNPQEVVLVNVTENFNMWKNDMVEQMHEDIISLWDQSLKPCV\nKLTPLCVSLKCTDLKNDTNTNSSSGRMIMEKGEIKNCSFNISTSIRGKVQKEYAFFYKLD\nIIPIDNDTTSYKLTSCNTSVITQACPKVSFEPIPIHYCAPAGFAILKCNNKTFNGTGPCT\nNVSTVQCTHGIRPVVSTQLLLNGSLAEEEVVIRSVNFTDNAKTIIVQLNTSVEINCTRPN\nNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHCNISRAKWNNTLKQIASKLREQFGNNKTII\nFKQSSGGDPEIVTHSFNCGGEFFYCNSTQLFNSTWFNSTWSTEGSNNTEGSDTITLPCRI\nKQIINMWQKVGKAMYAPPISGQIRCSSNITGLLLTRDGGNSNNESEIFRPGGGDMRDNWR\nSELYKYKVVKIEPLGVAPTKAKRRVVQREKRAVGIGALFLGFLGAAGSTMGAASMTLTVQ\nARQLLSGIVQQQNNLLRAIEAQQHLLQLTVWGIKQLQARILAVERYLKDQQLLGIWGCSG\nKLICTTAVPWNASWSNKSLEQIWNHTTWMEWDREINNYTSLIHSLIEESQNQQEKNEQEL\nLELDKWASLWNWFNITNWLWYIKLFIMIVGGLVGLRIVFAVLSIVNRVRQGYSPLSFQTH\nLPTPRGPDRPEGIEEEGGERDRDRSIRLVNGSLALIWDDLRSLCLFSYHRLRDLLLIVTR\nIVELLGRRGWEALKYWWNLLQYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQGACRAI\nRHIPRRIRQGLERILL\n'
Changing the 'w' to 'wb' when opening the file should fix it. Also, using with open() is the more Pythonic way of handling files.
for row in Namedf:
    with open(row + '.txt', 'wb') as fasta:
        fasta.write(page)
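Note also that, with two separate loops, page only holds the content of the last URL by the time the second loop runs. A minimal sketch that pairs each ID with its Name in a single loop (assuming the CSV really has 'Name' and 'ID' columns as described) would be:
import pandas as pd
import requests

df = pd.read_csv('env_sequences.csv')

# Pair each name with its UniProt ID so each file gets the matching sequence
for name, uniprot_id in zip(df['Name'], df['ID']):
    url = 'https://www.uniprot.org/uniprot/' + uniprot_id + '.fasta'
    page = requests.get(url).content  # bytes, hence the 'wb' mode below
    with open(name + '.txt', 'wb') as fasta:
        fasta.write(page)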
Please correct me if I am wrong, as I am a beginner in Python.
I have a web services URL which contains an XML file:
http://abc.tch.xyz.edu:000/patientlabtests/id/1345
I have a list of values. I want to append each value in that list to the URL, download the file for each value, and name each downloaded file after the value that was appended.
It is possible to download one file at a time, but I have thousands of values in the list, so I was trying to write a function with a for loop, and I am stuck.
x = [1345, 7890, 4729]
for i in x:
    url = http://abc.tch.xyz.edu:000/patientlabresults/id/{}.format(i)
    response = requests.get(url2)
    # ****** Missing part of the code ********
    with open('.xml', 'wb') as file:
        file.write(response.content)
        file.close()
The files downloaded from the URL should be named like:
"1345patientlabresults.xml"
"7890patientlabresults.xml"
"4729patientlabresults.xml"
I know there is a part of the code that is missing, and I am unable to fill it in. I would really appreciate it if anyone could help me with this.
Your web service URL does not seem to be accessible, so I can't test against it. Check this:
import requests

x = [1345, 7890, 4729]
for i in x:
    url2 = "http://abc.tch.xyz.edu:000/patientlabresults/id/"
    response = requests.get(url2 + str(i))  # i must be converted to a string
Note: When you use 'with' to open a file, you do not have to close it explicitly, since it will be closed automatically.
with open(filename, mode) as file:
    file.write(data)
Since the URL you provided is not working, I am going to use a different one. I hope it still gives you the idea of how to write to a file using a custom name.
import requests

categories = ['fruit', 'car', 'dog']
for category in categories:
    url = "https://icanhazdadjoke.com/search?term="
    response = requests.get(url + category)
    file_name = category + "_JOKES_2018"  # Files will be saved as e.g. fruit_JOKES_2018.txt
    data = response.status_code  # Storing the status code in the 'data' variable
    with open(file_name + ".txt", 'w+') as f:
        f.write(str(data))  # Writing the status code of each url to the file
After running this code, the status code will be written to each of the files, and the files will be named as follows:
car_JOKES_2018.txt
dog_JOKES_2018.txt
fruit_JOKES_2018.txt
I hope this gives you an understanding of how to name the files and write to them.
I think you just want to create a path using str.format, as you (almost) are doing for the URL. Maybe something like the following:
import os.path

import requests

x = [1345, 7890, 4729]
for i in x:
    path = '{}patientlabresults.xml'.format(i)
    # ignore this file if we've already got it
    if os.path.exists(path):
        continue
    # try and get the file, throwing an exception on failure
    url = 'http://abc.tch.xyz.edu:000/patientlabresults/id/{}'.format(i)
    res = requests.get(url)
    res.raise_for_status()
    # write the successful file out (response content is bytes, hence 'wb')
    with open(path, 'wb') as fd:
        fd.write(res.content)
I've added some error handling and better behaviour when the script is re-run: files that already exist are skipped.
I am opening an Excel spreadsheet, reading the data in column B, using that data to perform a URL lookup and a regex search on the page, then saving the data found into columns 3 and 4 of the same spreadsheet. Everything works okay, but it only writes the last row. I think this is because of how my for loops are structured. I have searched numerous topics on here but haven't been able to fathom it yet.
import os
from openpyxl import load_workbook
import urllib
import re
from urllib2 import urlopen

directoryPath = r'C:\Python27'  # The main folder
os.chdir(directoryPath)
folder_list = os.listdir(directoryPath)
for folders, sub_folders, file in os.walk(directoryPath):  # Traversing the sub folders
    for name in file:
        if name.endswith(".xlsx"):
            filename = os.path.join(folders, name)
            wb = load_workbook(filename, data_only=True)
            ws = wb.active
            cell_range = ws['B2':'B4']
            f = urllib.urlopen(cell.value)
            s = f.read()
            for row in cell_range:  # This iterates through rows 2-4
                for cell in row:  # This iterates through the columns (cells) in that row
                    ws.cell(row=ws._current_row, column=3).value = str(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s))
                    ws.cell(row=ws._current_row, column=4).value = str(re.findall(r"\(?(?:(?:0(?:0|11)\)?[\s-]?\(?|\+)44\)?[\s-]?\(?(?:0\)?[\s-]?\(?)?|0)(?:\d{2}\)?[\s-]?\d{4}[\s-]?\d{4}|\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4}|\d{4}\)?[\s-]?(?:\d{5}|\d{3}[\s-]?\d{3})|\d{5}\)?[\s-]?\d{4,5}|8(?:00[\s-]?11[\s-]?11|45[\s-]?46[\s-]?4\d))(?:(?:[\s-]?(?:x|ext\.?\s?|\#)\d+)?)", s))
            wb.save('scraper.xlsx')
Example URLs to test with:
http://www.knaptoninsurance.co.uk/contact-us
http://www.sterlingsafetywear.co.uk/contact
http://www.ilu.org.uk/contact.html
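No answer was recorded for this one, but the usual cause of "only the last row is written" here is that the page is fetched once, outside the cell loop, and the results are written using ws._current_row rather than the row of the cell being processed. A rough sketch of a restructured loop, assuming a single hypothetical scraper.xlsx workbook and keeping the Python 2 urllib call from the question (EMAIL_RE reuses the email pattern above; PHONE_RE is a simplified stand-in for the long phone pattern):
import re
import urllib
from openpyxl import load_workbook

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
PHONE_RE = re.compile(r"\+?44[\s-]?\d{3,4}[\s-]?\d{3}[\s-]?\d{3,4}|0\d{3,4}[\s-]?\d{3}[\s-]?\d{3,4}")  # simplified stand-in

wb = load_workbook('scraper.xlsx', data_only=True)  # hypothetical single workbook
ws = wb.active

for row in ws['B2':'B4']:
    for cell in row:
        if not cell.value:
            continue
        s = urllib.urlopen(cell.value).read()  # fetch inside the loop, once per URL
        # write next to this cell's own row instead of relying on ws._current_row
        ws.cell(row=cell.row, column=3).value = str(EMAIL_RE.findall(s))
        ws.cell(row=cell.row, column=4).value = str(PHONE_RE.findall(s))

wb.save('scraper.xlsx')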
[Image: JSON data output when printed in the command line]
I am currently pulling data via an API and am attempting to write the data into a CSV in order to run calculations in SQL. I am able to pull the data and open the CSV; however, an error occurs when the data is written to the CSV: each individual character ends up separated by a comma.
I am new to working with JSON data so I am curious if I need to perform an intermediary step between pulling the JSON data and inserting it into a CSV. Any help would be greatly appreciated as I am completely stuck on this (even the data provider does not seem to know how to get around this).
Please see the code below:
import requests
import time
import pyodbc
import csv
import json

headers = {'Authorization': 'Token'}
Metric1 = ['Website1', 'Website2']
Metric2 = ['users', 'hours', 'responses', 'visits']
Metric3 = ['Country1', 'Country2', 'Country3']
obs_list = []
obs_file = r'TEST.csv'

with open(obs_file, 'w') as csvfile:
    f = csv.writer(csvfile)
    for elem1 in Metric1:
        for elem2 in Metric2:
            for elem3 in Metric3:
                URL = "www.data.com"
                r = requests.get(URL, headers=headers, verify=False)
                for elem in r:
                    f.writerow(elem)
Edit: When I print the data instead of writing it to a CSV, the data appears in the command window in the following format:
[timestamp, metric], [timestamp, metric], [timestamp, metric] ...
Timestamp = 12 digit character
Metric = decimal value
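No answer is recorded here, but the comma-between-every-character symptom is what csv.writer.writerow produces when it is handed a plain string: it treats the string as a sequence of characters, one per column. Parsing the response as JSON first and passing a list per row avoids that. A rough sketch, assuming the API returns a list of [timestamp, metric] pairs as described in the edit above (the URL is a placeholder standing in for whatever is built from Metric1/2/3):
import csv
import requests

headers = {'Authorization': 'Token'}
URL = "https://www.data.com/api"  # placeholder endpoint

with open('TEST.csv', 'w', newline='') as csvfile:  # newline='' assumes Python 3
    writer = csv.writer(csvfile)
    writer.writerow(['timestamp', 'metric'])  # header row
    r = requests.get(URL, headers=headers, verify=False)
    for timestamp, metric in r.json():  # each item is a [timestamp, metric] pair
        writer.writerow([timestamp, metric])  # pass a list per row, not a string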
I think this block of code is pretty close to being right, but something is throwing it off. I'm trying to loop through 10 URLs, download the contents of each to a text file, and keep everything neatly structured in a dataframe.
import pandas as pd

rawHtml = ''
url = r'http://www.pga.com/golf-courses/search?page=" + i + "&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0'
g = open("C:/Users/rshuell001/Desktop/MyData.txt", "w")

for i in range(0, 10):
    df = pd.DataFrame.from_csv(url)
    print(df)
    g.write(str(df))

g.close()
The error that I get says:
CParserError: Error tokenizing data.
C error: Expected 1 fields in line 22, saw 2
I have no idea what that means. I only have 9 lines of code, so I don't know why it's mentioning a problem on line 22.
Can someone give me a push to get this working?
pandas.DataFrame.from_csv() takes a first argument which is either a path or a file-like handle, either of which is supposed to point at a valid CSV file.
You are providing it with a URL.
It seems that you want to use a different function: the top-level pandas.read_csv. This function will actually fetch the data for you from a valid URL and then parse it (see the sketch after this list).
If for any reason you insist on using pandas.DataFrame.from_csv(), you will have to:
1. Get the text from the page.
2. Persist the text, or the parts of it you need, as a valid CSV file or a file-like object.
3. Provide the path to that file, or the handle of the file-like object, as the first argument to pandas.DataFrame.from_csv().
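For illustration, fetching straight from a URL with the top-level function looks like this (a sketch with a placeholder URL; it only works if the URL actually serves CSV text, which the PGA search page does not, hence the scraping approach below):
import pandas as pd

# read_csv accepts a URL and fetches + parses in one step,
# provided the response body is plain CSV
df = pd.read_csv('https://example.com/data.csv')
print(df.head())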
I finally got it working. This is what I was trying to do all along.
import requests
from bs4 import BeautifulSoup
link = "http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
html = requests.get(link).text
soup = BeautifulSoup(html, "lxml")
res = soup.findAll("div", {"class": "views-field-nothing"})
for r in res:
    print("Address: " + r.find("span", {'class': 'field-content'}).text)
I am trying to write a script (Python 2.7.11, Windows 10) to collect data from an API and append it to a csv file.
The API I want to use returns data in json.
It limits the number of records displayed, though, and pages them.
So there is a maximum number of records you can get with a single query, and then you have to run another query with a different page number.
The API tells you how many pages a dataset is divided into.
Let's assume that the maximum number of records per page is 100 and the number of pages is 2.
My script:
import json
import urllib2
import csv

url = "https://some_api_address?page="
limit = "&limit=100"
myfile = open('C:\Python27\myscripts\somefile.csv', 'ab')

def api_iterate():
    for i in xrange(1, 2, 1):
        parse_url = url,(i),limit
        json_page = urllib2.urlopen(parse_url)
        data = json.load(json_page)
        for item in data['someobject']:
            print item ['some_item1'], ['some_item2'], ['some_item3']
        f = csv.writer(myfile)
        for row in data:
            f.writerow([str(row)])
This does not seem to work, i.e. it creates a csv file, but the file is not populated. There is obviously something wrong with either the part of the script that builds the address for the query, OR the part dealing with reading the json, OR the part dealing with writing the query results to csv. Or all of them.
I have tried using other resources and tutorials, but at some point I got stuck and I would appreciate your assistance.
The url you have given provides a link to the next page as one of the objects. You can use this to iterate automatically over all of the pages.
The script below gets each page, extracts two of the entries from the Dataobject array and writes them to an output.csv file:
import json
import urllib2
import csv

def api_iterate(myfile):
    url = "https://api-v3.mojepanstwo.pl/dane/krs_osoby"
    csv_myfile = csv.writer(myfile)
    cols = ['id', 'url']
    csv_myfile.writerow(cols)  # Write a header
    while True:
        print url
        json_page = urllib2.urlopen(url)
        data = json.load(json_page)
        json_page.close()
        for data_object in data['Dataobject']:
            csv_myfile.writerow([data_object[col] for col in cols])
        try:
            url = data['Links']['next']  # Get the next url
        except KeyError as e:
            break

with open(r'e:\python temp\output.csv', 'wb') as myfile:
    api_iterate(myfile)
This will give you an output file looking something like:
id,url
1347854,https://api-v3.mojepanstwo.pl/dane/krs_osoby/1347854
1296239,https://api-v3.mojepanstwo.pl/dane/krs_osoby/1296239
705217,https://api-v3.mojepanstwo.pl/dane/krs_osoby/705217
802970,https://api-v3.mojepanstwo.pl/dane/krs_osoby/802970