adding new version 1.5.0 of files

SouthGreenPlatform · Jun 21, 2021 · 6e072d8 · 6e072d8
1 parent e68a261
commit 6e072d8
Show file tree

Hide file tree

Showing 60 changed files with 8,253 additions and 0 deletions.
diff --git a/AdditionalScripts/controllingMapping.py b/AdditionalScripts/controllingMapping.py
@@ -0,0 +1,198 @@
+#!/usr/bin/python3.5
+# -*- coding: utf-8 -*-
+# @package controllingMapping.py
+# @author Francois Sabot
+
+"""
+	Controlling mapping data for a de novo assembly
+	==========================
+	:author:  François Sabot (thanks to Julie Orjuela Scripts help)
+	:contact: [email protected]
+	:date: 04/04/2019
+	:version: 0.1
+	Script description
+	------------------
+	controllingMapping.py will take a BAM file issued from the mapping of Illumina data on a de novo assembly and try to catch the misassembled contigs based on local depth and local mapping errors
+	-------
+	>>> controllingMapping.py -i mapping.bam -o outputPrefix [-s windowSize -t tempLocation]
+	
+	Help Programm
+	-------------
+	information arguments:
+		- \-h, --help
+						show this help message and exit
+		- \-v, --version
+						display controllingMapping.py version number and exit
+	Input mandatory infos for running:
+		- \-i <filename>, --input <filename>
+						BAM file issued from mapping (must be indexed)
+		- \-o <filename>, --out <filename>
+						Prefix of output files
+		- \-s <windowSize>, --size <windowSize>
+						window scan size in bases (Optional, default 1000)
+		- \-t <tempLocation>, --size <tempLocation>
+						Location for temp file, default /tmp
+"""
+
+##################################################
+## Modules
+##################################################
+import sys, os, subprocess, re
+current_dir = os.path.dirname(os.path.abspath(__file__))+"/"
+
+## Python modules
+import argparse
+from time import localtime, strftime
+import pysam
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+##################################################
+## Variables Globales
+##################################################
+version="0.1"
+VERSION_DATE='04/04/2018'
+debug="False"
+#debug="True"
+
+##################################################
+## Functions
+##################################################
+def checkParameters (arg_list):
+	# Check input related options
+	if (not arg_list.inputFile):
+		print ('Error: No input file defined via option -i/--input !' + "\n")
+		parser.print_help()
+		exit()
+	if (not arg_list.outputPrefix):
+		print ('Error: No output prefix file defined via option -o/--output !' + "\n")
+		parser.print_help()
+		exit()
+
+def relativeToAbsolutePath(relative): 
+	from subprocess import check_output
+	if relative[0] != "/":			# The relative path is a relative path, ie do not starts with /
+		command = "readlink -m "+relative
+		absolutePath = subprocess.check_output(command, shell=True).decode("utf-8").rstrip()
+		return absolutePath
+	else:							# Relative is in fact an absolute path, send a warning
+		absolutePath = relative;
+		return absolutePath
+
+def multicov(bedFile,bamFile,optionsList=("")):
+	'''Commentaires'''
+	optionsTxt = "".join(optionsList)
+	command = "bedtools multicov " + optionsTxt + " -bams " + bamFile + " -bed " + bedFile
+	#print (command)
+	try:
+		optionsTxt = str(OptionsTxt)
+		resultCom = os.popen(command,"r")
+	except NameError:
+		resultCom = os.popen(command,"r")
+
+	hashCov = {}
+	for line in resultCom:
+		#print(line)
+		mainLine = line.strip()
+		fields = mainLine.split("\t")
+		start = int(fields[1])
+		stop = int(fields[2])
+		position = start + int(stop - start/2)
+		try:
+			hashCov[fields[0]].append((position,int(fields[3])))
+		except KeyError:
+			hashCov[fields[0]] = [(position,int(fields[3]))]
+		#break
+		#sys.exit()
+	return hashCov	
+
+##################################################
+## Main code
+##################################################
+if __name__ == "__main__":
+	# Parameters recovery
+	parser = argparse.ArgumentParser(prog='controllingMapping.py', description='''This Program identifies the misassemblies in contigs using BAM, and uses BedTools2''')
+	parser.add_argument('-v', '--version', action='version', version='You are using %(prog)s version: ' + version, help=\
+						'display parsingSniffles.py version number and exit')
+	filesreq = parser.add_argument_group('Input mandatory infos for running')
+	filesreq.add_argument('-i', '--input', metavar="<filename>", required=True, dest = 'inputFile', help = 'vcf file')
+	filesreq.add_argument('-o', '--out', metavar="<filename>", required=True, dest = 'outputPrefix', help = 'Prefix for the output files')
+
+	parser.add_argument('-s', '--size', metavar="<sizeInBP>", required=False, default=1000, type=int, dest = 'windowSize', help = 'Window size for scanning')
+	parser.add_argument('-t', '--temp', metavar="<diskLocation>", required=False, default="/tmp", dest = 'tempLocation', help = 'Location of temp directory')
+
+
+
+
+	# Check parameters
+	args = parser.parse_args()
+	checkParameters(args)
+
+	#Welcome message
+	print("#################################################################")
+	print("# 	Welcome in controllingMapping (Version " + version + ")	   	#")
+	print("#################################################################")
+
+	#Window size for scanning
+	windowSize=args.windowSize
+	#Temp location
+	tempLocation=args.tempLocation
+
+
+	# From relative to absolute paths 
+	inputFile = relativeToAbsolutePath(args.inputFile)
+	outputPrefix = relativeToAbsolutePath(args.outputPrefix)
+
+	#open output handle
+	outputFile = outputPrefix + "_position.csv"
+	outputHandle = open(outputFile, "w")
+
+	#Temp bed opening
+	tempBed = tempLocation + "/temp.bed"
+	tempHandle = open(tempBed, "w")
+
+	#PySam import
+	bamFile = pysam.AlignmentFile(inputFile,"rb");
+
+	#Picking up infos on sequences
+	sizes=bamFile.lengths
+	names=bamFile.references
+	seqInfo = {a:b for a,b in zip(names,sizes)}
+	bamFile.close()
+
+	for seq in seqInfo.keys():
+		for window in range(1,seqInfo[seq],windowSize):
+			start = str(window)
+			stop = window + windowSize
+			if stop > seqInfo[seq]:
+				stop = seqInfo[seq] 
+			stop = str(stop)
+			tempHandle.write(seq + "\t" + start + "\t" + stop + "\n")
+
+	depthTotal = multicov(tempBed, inputFile)
+	options = ("-p")
+	depthOk = multicov(tempBed, inputFile, options)
+
+	#Creating the dataframe
+	for contigs in depthTotal.keys():
+		depthInfo={a:b for a,b in depthTotal[contigs]}
+		okInfo={a:b for a,b in depthOk[contigs]}
+		globalInfo={}
+		for position in depthInfo.keys():
+			globalInfo[position]=[depthInfo[position],okInfo[position]]
+		df =pd.DataFrame.from_dict(globalInfo, orient='index',columns=['Depth','Ok'])
+		df = df.sort_index()
+		#print(df)
+		somme = df['Depth'].sum()
+		if somme > 0:
+			plt.figure()
+			ax = plt.gca()
+			df.plot(kind='line',y='Depth', ax=ax,title=contigs)
+			df.plot(kind='line',y='Ok',color='red',ax=ax)
+			filename = contigs + ".png"
+			plt.savefig(filename)
+		else:
+			continue
+
+	sys.exit()
diff --git a/AdditionalScripts/makerange.py b/AdditionalScripts/makerange.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+import sys
+import argparse
+from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+"""
+split a genome into a set of overlapping segments
+adapted from nanopolish_makerange.py https://github.com/jts/nanopolish to CulebrONT 
+"""
+# DRAFT = snakemake.input.draft
+# SEGMENT_LENGTH = snakemake.params.segment_len if not snakemake.params.segment_len == '' else 50000
+# OVERLAP_LENGTH = snakemake.params.overlap_len if not snakemake.params.overlap_len == '' else 200
+# MIN_SEGMENT_LENGTH = 5 * OVERLAP_LENGTH
+# OUTPUTFILE = snakemake.output.segments_list
+
+parser = argparse.ArgumentParser(description='Partition a genome into a set of overlapping segments')
+parser.add_argument('--draft', help="draft to segment")
+parser.add_argument('--segment-length', type=int, default=50000)
+parser.add_argument('--overlap-length', type=int, default=200)
+parser.add_argument('--output-file', help="output file")
+args = parser.parse_args()
+
+DRAFT = args.draft
+SEGMENT_LENGTH = args.segment_length
+OVERLAP_LENGTH = args.overlap_length
+OUTPUT_FILE = args.output_file
+MIN_SEGMENT_LENGTH = 5 * OVERLAP_LENGTH
+
+with open(DRAFT, "r") as draft:
+    recs = [(title.split(None, 1)[0], len(seq))
+            for title, seq in SimpleFastaParser(draft)]
+
+with open(OUTPUT_FILE, "w") as output:
+    for name, length in recs:
+        n_segments = (length / SEGMENT_LENGTH) + 1
+        start = 0
+        while start < length:
+            end = start + SEGMENT_LENGTH
+            # If this segment will end near the end of the contig, extend it to end
+            if length - end < MIN_SEGMENT_LENGTH:
+                output.write(f"{name}:{start}-{length-1}\n")
+                start = length
+            else:
+                nlen = end + OVERLAP_LENGTH
+                output.write(f"{name}:{start}-{nlen}\n")
+                start = end
diff --git a/AdditionalScripts/rotateCircSeqs.R b/AdditionalScripts/rotateCircSeqs.R
@@ -0,0 +1,129 @@
+
+## Read Fasta file of a circular sequence, rotate it,
+## and save the resulting sequences in a new fasta file.
+
+#### Loading required packages #####
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library(Biostrings))
+
+#### COMMAND LINE ARGUMENTS PARSING ######
+option_list <- list(
+  make_option(c("-f", "--seqFile"),
+              type = "character",
+              default = NULL,
+              help = "Path of fasta file of seqences to be rotated."),
+  make_option(c("-o", "--outFilePath"),
+              type = "character",
+              default = NULL,
+              help = "Path where output sequence file will be written."),
+  make_option(c("-d", "--circlatorLog"),
+              type = "character",
+              default = NULL,
+              help = "Path to a circlator '04.merge.circularise.log' file to be use to rotate only circularized sequences.")
+)
+
+##### RETRIEVEING PARAMS from optparse #####
+myArgs <- parse_args(
+  OptionParser(usage = "%prog [options]", option_list = option_list,
+               description = "Rotate sequences in an input fasta file. Sequences will be rotated if the sequence title contains a 'circular' flag, they will be rotated and their title will be appended with a 'rotated' suffix. If a circlator.log file is provided and no info is available in the titles, the sequences described as circular in the log file will be rotated and their title will be appended with a 'circular|rotated' suffix. Sequences that do not meet these criteria are left untouched.")
+)
+
+###########################################
+##### FOR INTERACTIVE ARGUMENTS PASSING TESTS
+# testArgs  <-  c(
+#   "--seqFile=/home/cunnac/Lab-Related/MyScripts/pacbacpipe/tests/tagCirSeq/talCmut_canu/06.fixstartcircFlagged.fasta",
+#   "--outFilePath=/home/cunnac/rotateSeqTest.fasta",
+#   "--circlatorLog=/home/cunnac/Lab-Related/MyScripts/pacbacpipe/tests/tagCirSeq/talCmut_canu/04.merge.circularise.log"
+# )
+# myArgs <- parse_args(OptionParser(usage = "%prog", option_list = option_list), args = testArgs)
+###########################################
+
+# Assign parameter values
+seqFile <- myArgs$seqFile
+outSeqFile <- myArgs$outFilePath
+if(is.null(outSeqFile)) outSeqFile <- paste0(gsub("(^.*)\\..*$", "\\1", seqFile), "_rotated.fasta")
+circlatorLog <- myArgs$circlatorLog
+circLogProvided <- ifelse(is.null(circlatorLog) || circlatorLog == "", FALSE, TRUE)
+
+# This is the number by which seq lenght is devided to determine length of offset
+n <- 5
+
+# Loading sequences and determining if any is flagged as circular
+seq <- readDNAStringSet(seqFile) #seq <- DNAStringSet(c("AAATTTGGGCCCNNN", "AAAATTTTCCCC"))
+circularInName <- grepl("circular", names(seq), ignore.case = TRUE)
+
+if (circLogProvided && any(circularInName)) { # if log and flagged seq, sanity check on names
+  circlatorDf <- read.delim(circlatorLog, stringsAsFactors = FALSE)
+  seqNamePrefixes <- sapply(names(seq), function(x) {strsplit(x, split = "[_ ]")[[1]][1]})
+  circNamePrefixes <- sapply(circlatorDf$X.Contig, function(x) {strsplit(x, split = "[_ ]")[[1]][1]})
+  if(!identical(length(setdiff(seqNamePrefixes, circNamePrefixes)), 0L)) {
+    stop("Sequence names in sequence file and circlatorLog do not match:\n",
+         "Names in fasta file: ", names(seqNamePrefixes), "\n",
+         "Names in circlator log: ", names(circNamePrefixes), "\n")
+  } else {
+    cat("## Provided a circlator log file and sequences in fasta file are also otherwise flagged as circular...\n",
+        "## Circular flags in fasta sequence titles will prevail to select sequences that are rotated.", sep = "")
+  }
+}
+
+# Deciding on what will be rotated
+if (any(circularInName)) { # if some seqs have 'circular' in their name, flagged seq prevails regardless of log
+  circSeq <- seq[circularInName]
+  linearSeq <- seq[!circularInName]
+} else if (circLogProvided && !any(circularInName)) { # if log and no flagged seq, log prevails
+  circlatorDf <- read.delim(circlatorLog, stringsAsFactors = FALSE)
+  circSeq <- seq[circlatorDf[circlatorDf$circularised == 1, "X.Contig"]]
+  linearSeq <- seq[circlatorDf[circlatorDf$circularised == 0, "X.Contig"]]
+}  else { # No seqs will be rotated!!!
+  warning("No 'circular' hit in sequence names and no circlatorLog file provided either, NONE of the sequences will be rotated!!!!.")
+  circSeq <- DNAStringSet()
+  linearSeq <- seq
+}
+
+# Rotating circular sequences
+newCircSeq <- DNAStringSet(
+  sapply(X = circSeq, FUN = function(x) {
+    seqLength <- nchar(x)
+    newStartPos <- round(seqLength/n)
+    c(subseq(x, start = newStartPos, end = seqLength), subseq(x, end = newStartPos-1, width = newStartPos-1))
+  })
+)
+# Modify names of rotated circular sequences if necessary
+if (length(circSeq) != 0) {
+  names(newCircSeq) <- if (any(circularInName)) {
+    paste(names(newCircSeq), "rotated", sep = "_")
+  } else if (circLogProvided && !any(circularInName)) {
+    paste(names(newCircSeq), "circular_rotated", sep = "_")
+  }
+}
+# Building output seq set
+newSeq <- c(linearSeq, newCircSeq)
+
+# Sanity check
+if(!identical(length(setdiff(nchar(seq), nchar(newSeq))), 0L)) stop("Rotated sequence has a different length than the original one!")
+# Save output
+writeXStringSet(x = newSeq, filepath = outSeqFile, compress = FALSE)
+
+# Log messages
+summaryDf <- data.frame(
+  seqNames = names(newSeq),
+  seqLength = nchar(newSeq),
+  StartAtFormerPosition = round(nchar(newSeq)/n)
+)
+cat("\n")
+cat("***********************************************************************************************\n")
+if (!circLogProvided & !any(circularInName)) {
+  cat("##", date(), ": No 'circular' hit in sequence names and no circlatorLog file provided either, NOTHING has been rotated!!!!")
+  print(knitr::kable(summaryDf[, c("seqNames", "seqLength")]))
+} else if (length(circSeq) != 0) {
+  cat("##", date(), ": Summary of circular sequences rotation in '", seqFile, "' :")
+  print(knitr::kable(summaryDf))
+} else {
+  cat("##", date(), ": No 'circular' hit in sequence names and none of the sequences in '", seqFile, "' were specified as circular in circlatorLog file and nothing has been rotated!")
+  print(knitr::kable(summaryDf[, c("seqNames", "seqLength")]))
+}
+cat("\n")
+cat("All sequences saved in:", outSeqFile, "\n")
+cat("***********************************************************************************************\n")
+
+quit(save = "no")