This is an extension of a question I asked yesterday. I have looked all over StackOverflow and have not found an instance of this specific NameError:
Building DAG of jobs...
Updating job done.
InputFunctionException in line 148 of /home/nasiegel/2022-h1n1/Snakefile:
Error:
NameError: free variable 'combinator' referenced before assignment in enclosing scope
Wildcards:
Traceback:
File "/home/nasiegel/2022-h1n1/Snakefile", line 131, in aggregate_decompress_h1n1
I assumed it was an issue having to do with the symbolic file paths in my function:
def aggregate_decompress_h1n1(wildcards):
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
    filenames = expand(
        SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
        SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
        SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
        SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
        SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
        OUTPUTDIR + "{basenames}_quant/quant.sf",
        basenames=glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
    return filenames
However, hardcoding the paths does not resolve the issue. I've attached the full Snakefile below; any advice would be appreciated.
Original file
# Snakemake file - input raw reads to generate quant files for analysis in R
configfile: "config.yaml"

import io
import os
import pandas as pd
import pathlib
from snakemake.exceptions import print_exception, WorkflowError

#----SET VARIABLES----#
PROJ = config["proj_name"]
INPUTDIR = config["raw-data"]
SCRATCH = config["scratch"]
REFERENCE = config["ref"]
OUTPUTDIR = config["outputDIR"]

# Adapters
SE_ADAPTER = config['seq']['SE']
SE_SEQUENCE = config['seq']['trueseq-se']

# Organism
TRANSCRIPTOME = config['transcriptome']['rhesus']
SPECIES = config['species']['rhesus']

SAMPLE_LIST = glob_wildcards(INPUTDIR + "{basenames}_R1.fastq.gz").basenames

rule all:
    input:
        "final.txt",
        # download reference files
        REFERENCE + SE_ADAPTER,
        REFERENCE + SPECIES,
        # multiqc
        SCRATCH + "fastqc/raw_multiqc.html",
        SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        SCRATCH + "fastqc/trimmed_multiqc.html",
        SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"

rule download_trimmomatic_adapter_file:
    output: REFERENCE + SE_ADAPTER
    shell: "curl -L -o {output} {SE_SEQUENCE}"

rule download_transcriptome:
    output: REFERENCE + SPECIES
    shell: "curl -L -o {output} {TRANSCRIPTOME}"

rule download_data:
    output: "high_quality_files.tgz"
    shell: "curl -L -o {output} https://osf.io/pcxfg/download"

checkpoint decompress_h1n1:
    output: directory(INPUTDIR)
    input: "high_quality_files.tgz"
    shell: "tar xzvf {input}"

rule fastqc:
    input: INPUTDIR + "{basenames}_R1.fastq.gz"
    output:
        raw_html = SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
        raw_zip = SCRATCH + "fastqc/{basenames}_R1_fastqc.zip"
    conda: "env/rnaseq.yml"
    wrapper: "0.80.3/bio/fastqc"

rule multiqc:
    input:
        raw_qc = expand(SCRATCH + "fastqc/{basenames}_R1_fastqc.zip", basenames=SAMPLE_LIST),
        trim_qc = expand(SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip", basenames=SAMPLE_LIST)
    output:
        raw_multi_html = SCRATCH + "fastqc/raw_multiqc.html",
        raw_multi_stats = SCRATCH + "fastqc/raw_multiqc_general_stats.txt",
        trim_multi_html = SCRATCH + "fastqc/trimmed_multiqc.html",
        trim_multi_stats = SCRATCH + "fastqc/trimmed_multiqc_general_stats.txt"
    conda: "env/rnaseq.yml"
    shell:
        """
        multiqc -n multiqc.html {input.raw_qc}  # run multiqc
        mv multiqc.html {output.raw_multi_html}  # rename html
        mv multiqc_data/multiqc_general_stats.txt {output.raw_multi_stats}  # move and rename stats
        rm -rf multiqc_data  # clean-up
        # repeat for trimmed data
        multiqc -n multiqc.html {input.trim_qc}  # run multiqc
        mv multiqc.html {output.trim_multi_html}  # rename html
        mv multiqc_data/multiqc_general_stats.txt {output.trim_multi_stats}  # move and rename stats
        rm -rf multiqc_data  # clean-up
        """

rule trimmmomatic_se:
    input:
        reads = INPUTDIR + "{basenames}_R1.fastq.gz",
        adapters = REFERENCE + SE_ADAPTER,
    output:
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        unpaired = SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz"
    conda: "env/rnaseq.yml"
    log: SCRATCH + "logs/fastqc/{basenames}_R1_trim_unpaired.log"
    shell:
        """
        trimmomatic SE {input.reads} \
        {output.reads} {output.unpaired} \
        ILLUMINACLIP:{input.adapters}:2:0:15 LEADING:2 TRAILING:2 \
        SLIDINGWINDOW:4:2 MINLEN:25
        """

rule fastqc_trim:
    input: SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz"
    output:
        html = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
        zip = SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip"
    log: SCRATCH + "logs/fastqc/{basenames}_R1_trimmed.log"
    conda: "env/rnaseq.yml"
    wrapper: "0.35.2/bio/fastqc"

rule salmon_quant:
    input:
        reads = SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        index_dir = OUTPUTDIR + "quant/sc_ensembl_index"
    output: OUTPUTDIR + "{basenames}_quant/quant.sf"
    params: OUTPUTDIR + "{basenames}_quant"
    log: SCRATCH + "logs/salmon/{basenames}_quant.log"
    conda: "env/rnaseq.yml"
    shell:
        """
        salmon quant -i {input.index_dir} --libType A -r {input.reads} -o {params} --seqBias --gcBias --validateMappings
        """

def aggregate_decompress_h1n1(wildcards):
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
    filenames = expand(
        SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
        SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
        SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
        SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
        SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
        SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
        OUTPUTDIR + "{basenames}_quant/quant.sf",
        basenames=glob_wildcards(os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames)
    return filenames

rule salmon_index:
    input: REFERENCE + SPECIES
    output: directory(OUTPUTDIR + "quant/sc_ensembl_index")
    conda: "env/rnaseq.yml"
    shell: "salmon index --index {output} --transcripts {input}  # --type quasi"

rule done:
    input: aggregate_decompress_h1n1
    output: "final.txt"
    shell: "touch {output}"
I think it's because you used the expand function in the wrong way: expand only accepts two positional arguments, where the first one is the pattern and the second (optional) one is a combinator function. If you want to supply multiple patterns, you should wrap them in a list.
After some study of the Snakemake source code, it turns out the expand function doesn't check whether the user provides fewer than three positional arguments. A variable named combinator is only created inside an if-else branch that handles the case of one or two positional arguments; the large number of positional arguments you provide skips that branch, which leads to the error when combinator is used later.
Source code: https://snakemake.readthedocs.io/en/v6.5.4/_modules/snakemake/io.html
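For example, the input function could pass all patterns as a single list, so expand receives exactly one positional argument (an untested sketch of the fix, using the same variables as the Snakefile above):

def aggregate_decompress_h1n1(wildcards):
    checkpoint_output = checkpoints.decompress_h1n1.get(**wildcards).output[0]
    basenames = glob_wildcards(
        os.path.join(checkpoint_output, "{basenames}_R1.fastq.gz")).basenames
    # One list = one positional "pattern" argument, so the branch that
    # creates combinator is taken and the NameError goes away.
    filenames = expand(
        [SCRATCH + "fastqc/{basenames}_R1_fastqc.html",
         SCRATCH + "fastqc/{basenames}_R1_fastqc.zip",
         SCRATCH + "trimmed/{basenames}_R1_trim.fastq.gz",
         SCRATCH + "trimmed/{basenames}_R1.unpaired.fastq.gz",
         SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.html",
         SCRATCH + "fastqc/{basenames}_R1_trimmed_fastqc.zip",
         OUTPUTDIR + "{basenames}_quant/quant.sf"],
        basenames=basenames)
    return filenames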
Related
I plan to implement a pipeline where I search for specific transcripts in three different genomes, align the best hits, and estimate some statistics for each.
To automate the task in Snakemake, I run blat on the genome sequences. At some point, though, the pipeline needs to use the output transcripts from blat as the subsequent inputs. The problem is that I don't know which transcripts will be output by the checkpoint get_transcripts (see below). Does anyone know how to read the directory containing the transcripts and use them as parallel inputs for the next steps? I tried to implement a function to read the path, but then the rule macse_align (see below) gets a list of files as input, and Snakemake does not iterate by transcript name but instead tries to execute the rule using all the input at once.
I have checked similar posts, but the solutions usually use the list of files inside a directory as the input list for the next rule (e.g. "use directories or all files in directories as input in snakemake").
Here is my code:
import os
import glob

configfile: 'config.yaml'

ROOT = os.path.abspath('genomes') + '/'

for d in ['pslx', 'info', 'genes', 'annotations']:
    os.makedirs(ROOT + d, exist_ok=True)

rule all:
    input:
        outgroups = expand(ROOT + config['FOCAL'] + '_{outgroup}.fa', outgroup=config['OUTGROUPS']),
        "genomes/genes/",
        [f + '/' + f.split('/')[-1] + '_NT.fa' for f in glob.glob('genomes/genes/*')]

rule parse_cds:
    input:
        ROOT + config['PREFIX'] + 'cds.all.fa.gz'
    output:
        ROOT + config['FOCAL'] + '_cds.fa'
    shell:
        """bioawk -c fastx '{{print ">"$name"\\n"$seq}}' {input} > {output}"""

rule pblat:
    input:
        cds = ROOT + config['FOCAL'] + '_cds.fa',
        genome = ROOT + "{outgroup}.fa.gz"
    output:
        ROOT + 'pslx/' + config['FOCAL'] + "_{outgroup}.pslx"
    threads:
        config['THREADS']
    shell:
        """
        pblat {input.genome} {input.cds} -t=dna -q=dna -minIdentity=60 -fine -threads={threads} -out=pslx {output}
        """

rule pslx_reps:
    input:
        ROOT + 'pslx/' + config['FOCAL'] + "_{outgroup}.pslx"
    output:
        pslx = ROOT + 'pslx/' + config['FOCAL'] + "_{outgroup}_reps.pslx",
        psr = ROOT + 'pslx/' + config['FOCAL'] + "_{outgroup}_reps.psr"
    shell:
        "pslReps -nohead {input} {output.pslx} {output.psr}"

rule pslx_info:
    input:
        ROOT + "pslx/" + "human_{outgroup}_reps.pslx"
    output:
        fa = ROOT + "human_{outgroup}.fa",
        info = ROOT + "info/" + "human_{outgroup}.info"
    shell:
        "perl scripts/pslx_to_fasta.pl --pslx {input} --fasta {output.fa} --info {output.info}"

checkpoint get_transcripts:
    input:
        outgroups = expand(ROOT + config['FOCAL'] + '_{outgroup}.fa', outgroup=config['OUTGROUPS']),
        infos = expand(ROOT + 'info/{outgroup}_back.info', outgroup=config['OUTGROUPS']),
        focal = ROOT + config['FOCAL'] + '_cds.fa',
    output:
        directory(ROOT + 'genes/')
    params:
        species_dict = config["OUTGROUPS"],
        distance = config['DISTANCE'],
        gtf = ROOT + config['GTF']
    script:
        'scripts/test.py'

# input function for rule macse_align; returns paths to all files produced by the checkpoint 'get_transcripts'
def list_transcripts(wildcards):
    checkpoint_output = checkpoints.get_transcripts.get(**wildcards).output[0]
    in_dir = glob.glob(checkpoint_output + '/*/*')
    return [i + "/" + i.split('/')[-1] + '.fa' for i in in_dir]

rule macse_align:
    input:
        list_transcripts
    output:
        [f + '/' + f.split('/')[-1] + '_NT.fa' for f in glob.glob('genomes/genes/*')]
    shell:
        """
        java -Xmx8G -jar scripts/macse_v2.06.jar -prog alignSequences -seq {input} -out_NT {output}
        """
I'm a beginner with coding and Snakemake, and I'm really struggling to understand my problem. Running the Snakefile below produces no error, but it does not execute the Bowtie rule. A --dryrun shows:
Building DAG of jobs... Nothing to be done.
My guess would be that I mixed something up with the wildcards, and Snakemake thinks the files already exist, so it does not execute the rule at all. The rule does work when I hardcode it. I tried to change the wildcards but can't get it to run.
#Snakefile
configfile: "../../config/config.yaml"

INPUTDIR = str(config["paths"]["input"])
OUTPUTDIR = str(config["paths"]["output"])
FILE_FORMAT = str(config["paths"]["file_format"])
DATABASEPATH = str(config["database"]["Bowtie_Database"])

(SAMPLES, NUMBERS) = glob_wildcards(INPUTDIR + "/{sample}_{number, [1,2]}." + FILE_FORMAT)
DATABASE, = glob_wildcards(DATABASEPATH + "/{bowtie_ref}")

# Output files
rule all:
    input:
        # FastQC raw
        expand(OUTPUTDIR + "/FastQC/raw/{sample}_{number}_fastqc.html", sample=SAMPLES, number=NUMBERS),
        expand(OUTPUTDIR + "/FastQC/raw/{sample}_{number}_fastqc.zip", sample=SAMPLES, number=NUMBERS),
        # Bowtie output
        expand(OUTPUTDIR + "/Bowtie/{bowtie_ref}_{sample}.sam", bowtie_ref=DATABASE, sample=SAMPLES),
        # macs2 output
        #"/home/henri/MPI/Pipeline/Mus/results/Macs2/eg2.bed"

#######
# Q C #
#######
# Quality control for raw data with FastQC
rule qc_raw_fastqc:
    params:
        threads = config["threads"]
    conda:
        "envs/fastqc.yml"
    input:
        INPUTDIR + "/{sample}_{number}." + FILE_FORMAT
    output:
        html = OUTPUTDIR + "/FastQC/raw/{sample}_{number}_fastqc.html",
        zip = OUTPUTDIR + "/FastQC/raw/{sample}_{number}_fastqc.zip"
    message:
        "Doing quality control for raw reads with FastQC"
    shell:
        "fastqc -o {config[paths][output]}/FastQC/raw {input}"

################
## B O W T I E #
################
# mapping on ref. genome with Bowtie2
rule Bowtie:
    params:
        threads = config["threads"]
    conda:
        "envs/fastqc.yml"
    input:
        expand(INPUTDIR + "/Bowtie_Database/{{bowtie_ref}}{ending}", ending=[".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"]),
        R1 = INPUTDIR + "{sample}.fastq",
        R2 = INPUTDIR + "{sample}.fastq"
    output:
        OUTPUTDIR + "/Bowtie/{bowtie_ref}_{sample}.sam"
    message:
        "Alignment with Bowtie2, this will take a while"
    shell:
        "bowtie2 -x {INPUTDIR}/{wildcards.bowtie_ref} -1 {input.R1} -2 {input.R2} -S {output}"
Any help or ideas would be really appreciated, thank you!
@DmitryKuzminov is probably right: the output files exist and they are newer than the input.
You can force re-execution of rule Bowtie (and everything that depends on its output) with:
snakemake --forcerun Bowtie ...
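To confirm why Snakemake decides nothing needs to be done, a dry run that prints the scheduling reasons can help (the -r/--reason flag exists in older Snakemake releases; recent versions print reasons by default):

snakemake -n -r    # dry run; prints the reason each job is (or is not) scheduled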
Is there a way to access MALLET's diagnostics file, or its content, through the API provided via Gensim in Python?
It seems there is no way to do that through the wrapper.
I solved this issue by running MALLET on the command line via Python's subprocess module:
import subprocess
from pathlib import Path

MALLET_PATH = r"C:\mallet"  # set to where your "bin/mallet" path is

seglen = 500
topic_count = 20
start = 0
iterations = 20
num_threads = 10  # determines threads used for parallel training

# remember to change backslashes if needed
wdir = Path("../..")
corpusdir = wdir.joinpath("5_corpus", f"seglen-{seglen}")
corpusdir.mkdir(exist_ok=True, parents=True)
mallet_dir = wdir.joinpath("6_evaluation/models/mallet", f"seglen-{seglen}")
topic_dir = mallet_dir.joinpath(f"topics-{topic_count}")

def create_input_files():
    # create MALLET's input files
    for file in corpusdir.glob("*.txt"):
        output = mallet_dir.joinpath(f"{file.stem}.mallet")
        # doesn't need to happen more than once -- usually.
        if output.is_file(): continue
        print(f"--{file.stem}")
        cmd = f"bin\\mallet import-file " \
              f"--input {file.absolute()} " \
              f"--output {output.absolute()} " \
              f"--keep-sequence"
        subprocess.call(cmd, cwd=MALLET_PATH, shell=True)
    print("import finished")

def modeling():
    # start modeling
    for file in mallet_dir.glob("*.mallet"):
        for i in range(start, iterations):
            print("iteration ", str(i))
            print(f"--{file.stem}")
            # output directory
            formatdir = topic_dir.joinpath(f"{file.stem.split('-')[0]}")
            outputdir = formatdir.joinpath(f"iteration-{i}")
            outputdir.mkdir(parents=True, exist_ok=True)
            outputdir = str(outputdir.absolute())
            # output files
            statefile = outputdir + r"\topic-state.gz"
            keysfile = outputdir + r"\keys.txt"
            compfile = outputdir + r"\composition.txt"
            diagnostics_xml = outputdir + r"\diagnostics.xml"
            # building cmd string
            cmd = f"bin\\mallet train-topics " \
                  f"--input {file.absolute()} " \
                  f"--num-topics {topic_count} " \
                  f"--output-state {statefile} " \
                  f"--output-topic-keys {keysfile} " \
                  f"--output-doc-topics {compfile} " \
                  f"--diagnostics-file {diagnostics_xml} " \
                  f"--num-threads {num_threads}"
            # call mallet
            subprocess.call(cmd, cwd=MALLET_PATH, shell=True)
    print("models trained")

#create_input_files()
modeling()
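Once a run finishes, the diagnostics content can be read back in Python with the standard library. A minimal sketch (the path is a placeholder; the exact attribute names in the XML depend on the MALLET version, so inspect attrib rather than hard-coding keys):

import xml.etree.ElementTree as ET

def read_diagnostics(path):
    # parse MALLET's diagnostics XML and yield per-topic attribute dicts
    root = ET.parse(path).getroot()
    for topic in root.iter("topic"):
        yield topic.attrib

# adjust to one of the diagnostics.xml files written by modeling() above
for attrs in read_diagnostics(r"path\to\iteration-0\diagnostics.xml"):
    print(attrs)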
I am writing a script that allows a user to put information in a text file such as
Alice;McCormick;ballstate;2000;3457
using this format
FirstName;LastName;Password;UID;GID
import os
import hashlib

iFile = open("NewUsers.txt", "rt")
fileContents = iFile.readlines()

Username = ""
fname = ""
lname = ""
password = ""
uid = ""
gid = ""

for line in fileContents:
    items = line.split(';')
    fname = items[0].lower()
    lname = items[1].lower()
    username = fname[0] + lname[0:7]
    password = hashlib.sha256(items[2]).hexdigest()
    uid = items[3]
    gid = items[4]
    os.system("/usr/sbin/useradd -p " + password + " -u " + uid + " -g " + gid + username)
I created a group called 3000 that has a group ID of 3457, so it already exists. When I run the script I get the following output:
Usage: useradd [options] LOGIN
useradd -D
useradd -D [options]
Options: (followed by a list of all the available useradd options)
Below that I receive: sh: 2: amccormi: not found
I have never saved a command line as a variable; do you just save it as a string?
You're passing a string to os.system, so sure, you can just save that to a string:
cmd = "/usr/sbin/useradd -p " + password + " -u " + uid + " -g " + gid + username
And then you can print it out:
print "running command:", cmd
Before passing it to os.system:
os.system(cmd)
What you will find, ultimately, is that when you do this:
>>> iFile = open('data')
>>> fileContents = iFile.readlines()
>>> for line in fileContents:
... items = line.split(';')
... print items
The variable items will end up containing the following:
['Alice', 'McCormick', 'ballstate', '2000', '3457\n']
Look at the final item in that list, which is 3457\n. It contains a
newline character, which means when you build your command line like
this:
os.system("/usr/sbin/useradd -p " + password + " -u " + uid + " -g " + gid + username)
You end up passing the following to /bin/sh:
/usr/sbin/useradd -p ballstate -u 2000 -g 3457
amccormi
Hopefully at this point it's clear why you're getting the error that
you've described.
There are a couple of ways to solve this problem. The simplest is
probably to call line.strip(), which will remove whitespace
(including newlines) at the beginning and end of your string:
>>> for line in fileContents:
... items = line.strip().split(';')
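Putting it together, the corrected loop might look like this (a sketch based on the fix above; note that it also adds a space before the username, which the original command string was missing):

for line in fileContents:
    items = line.strip().split(';')  # strip() drops the trailing newline
    fname = items[0].lower()
    lname = items[1].lower()
    username = fname[0] + lname[0:7]
    password = hashlib.sha256(items[2]).hexdigest()
    uid = items[3]
    gid = items[4]
    # note the added space before the username
    cmd = ("/usr/sbin/useradd -p " + password + " -u " + uid +
           " -g " + gid + " " + username)
    print "running command:", cmd
    os.system(cmd)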
I am having trouble calling an EMBOSS program (which runs via command line) called sixpack through Python.
I am running Windows 7 with Python 3.2.3, Biopython 1.59, and EMBOSS 6.4.0.4. Sixpack translates a DNA sequence in all six reading frames and creates two output files: a sequence file identifying ORFs, and a file containing the protein sequences.
There are three required arguments, which I can supply successfully on the command line: -sequence [input file], -outseq [output sequence file], and -outfile [protein sequence file]. I have been using the subprocess module in place of os.system, as I have read that it is more powerful and versatile.
The following is my Python code, which runs without error but does not produce the desired output files.
from Bio import SeqIO
import re
import os
import subprocess

infile = input('Full path to EXISTING .fasta file would you like to open: ')
outdir = input('NEW Directory to write outfiles to: ')
os.mkdir(outdir)

for record in SeqIO.parse(infile, "fasta"):
    print("Translating (6-Frame): " + record.id)
    ident = re.sub("\|", "-", record.id)
    print(infile)
    print("Old record ID: " + record.id)
    print("New record ID: " + ident)
    subprocess.call(['C:\memboss\sixpack.exe', '-sequence ' + infile, '-outseq ' + outdir + ident + '.sixpack', '-outfile ' + outdir + ident + '.format'])
    print("Translation of: " + infile + "\nWritten to: " + outdir + ident)
Found the answer: I was using the wrong syntax to call subprocess. This is the correct syntax:
subprocess.call (['C:\memboss\sixpack.exe', '-sequence', infile, '-outseq', outdir + ident + '.sixpack', '-outfile', outdir + ident + '.format'])
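This works because subprocess passes each list element to the program as its own argument and does no word-splitting of the string. With the original call, sixpack received fused arguments like the flag and its value joined in one string, which would explain why it produced no output files. A short illustration of the difference:

# fused: sixpack sees ONE argument, "-sequence <path>", and cannot parse it
subprocess.call(['C:\memboss\sixpack.exe', '-sequence ' + infile])
# separate: sixpack sees the flag "-sequence" followed by its value
subprocess.call(['C:\memboss\sixpack.exe', '-sequence', infile])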