Skip to content
This repository has been archived by the owner on Jan 31, 2024. It is now read-only.

Commit

Permalink
adding new version 1.5.0 of files
Browse files Browse the repository at this point in the history
  • Loading branch information
julieaorjuela committed Jun 21, 2021
1 parent e68a261 commit 6e072d8
Show file tree
Hide file tree
Showing 60 changed files with 8,253 additions and 0 deletions.
198 changes: 198 additions & 0 deletions AdditionalScripts/controllingMapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/python3.5
# -*- coding: utf-8 -*-
# @package controllingMapping.py
# @author Francois Sabot

"""
Controlling mapping data for a de novo assembly
==========================
:author: François Sabot (thanks to Julie Orjuela Scripts help)
:contact: [email protected]
:date: 04/04/2019
:version: 0.1
Script description
------------------
controllingMapping.py will take a BAM file issued from the mapping of Illumina data on a de novo assembly and try to catch the misassembled contigs based on local depth and local mapping errors
-------
>>> controllingMapping.py -i mapping.bam -o outputPrefix [-s windowSize -t tempLocation]
Help Program
-------------
information arguments:
- \-h, --help
show this help message and exit
- \-v, --version
display controllingMapping.py version number and exit
Input mandatory infos for running:
- \-i <filename>, --input <filename>
BAM file issued from mapping (must be indexed)
- \-o <filename>, --out <filename>
Prefix of output files
- \-s <windowSize>, --size <windowSize>
window scan size in bases (Optional, default 1000)
- \-t <tempLocation>, --temp <tempLocation>
Location for temp file, default /tmp
"""

##################################################
## Modules
##################################################
import sys, os, subprocess, re
current_dir = os.path.dirname(os.path.abspath(__file__))+"/"

## Python modules
import argparse
from time import localtime, strftime
import pysam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

##################################################
## Variables Globales
##################################################
# Version string shown by the -v/--version option and in the welcome banner.
version="0.1"
# Presumably the release date of this version.
# NOTE(review): the module docstring says 04/04/2019 - confirm which year is right.
VERSION_DATE='04/04/2018'
# Debug switch kept as a string, not a bool: any non-empty string is truthy,
# so it has to be compared against "True" explicitly.
debug="False"
#debug="True"

##################################################
## Functions
##################################################
def checkParameters(arg_list):
    """Abort the program when a mandatory command-line option is missing.

    Args:
        arg_list: Parsed arguments (an ``argparse.Namespace``) exposing the
            ``inputFile`` and ``outputPrefix`` attributes.

    Raises:
        SystemExit: With status 1 when either attribute is empty or ``None``,
            after printing the error(s) on stderr and the parser help.
    """
    problems = []
    if not arg_list.inputFile:
        problems.append('Error: No input file defined via option -i/--input !')
    if not arg_list.outputPrefix:
        # The long option registered on the parser is --out, not --output.
        problems.append('Error: No output prefix file defined via option -o/--out !')
    if not problems:
        return

    for problem in problems:
        print(problem + "\n", file=sys.stderr)
    # ``parser`` is the module-level ArgumentParser created by the script
    # entry point; it is looked up as a global on purpose.
    parser.print_help()
    # A missing mandatory option is a failure: report it with a non-zero status.
    sys.exit(1)

def relativeToAbsolutePath(relative):
    """Return an absolute version of the path *relative*.

    A path that already starts with "/" is returned untouched. Any other path
    is resolved against the current working directory, following symlinks and
    tolerating components that do not exist yet (the behaviour of
    ``readlink -m``), which is needed for output prefixes.

    Args:
        relative: Path as given on the command line.

    Returns:
        The absolute path as a string.

    Raises:
        ValueError: If *relative* is empty.
    """
    if not relative:
        raise ValueError("Cannot resolve an empty path")
    if relative.startswith("/"):
        # Already absolute: hand it back unchanged.
        return relative
    # Resolve in-process instead of shelling out to ``readlink``: paths with
    # spaces or shell metacharacters are handled correctly and nothing the
    # user typed is ever interpreted by a shell.
    return os.path.realpath(relative)

def multicov(bedFile, bamFile, optionsList=""):
    """Run ``bedtools multicov`` and collect the per-window read counts.

    Args:
        bedFile: Path of the BED file listing the windows to count over.
        bamFile: Path of the BAM file passed to ``-bams``.
        optionsList: Extra ``bedtools multicov`` options, either a single
            string (``"-p"``) or an iterable of strings (``("-p", "-q 10")``).

    Returns:
        A dict mapping each sequence name (first output column) to a list of
        ``(position, count)`` tuples in output order. ``position`` is the
        midpoint of the window (columns 2 and 3) and ``count`` is column 4.

    Raises:
        subprocess.CalledProcessError: If bedtools exits with a non-zero status.
        OSError: If the bedtools executable cannot be started.
    """
    import shlex

    if isinstance(optionsList, str):
        optionsList = (optionsList,)
    extraArgs = []
    for option in optionsList:
        # Each entry may itself hold several whitespace-separated tokens.
        extraArgs.extend(shlex.split(option))

    # Argument list, no shell: file names are passed verbatim to bedtools.
    command = ["bedtools", "multicov"] + extraArgs + ["-bams", bamFile, "-bed", bedFile]

    hashCov = {}
    with subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True) as process:
        for line in process.stdout:
            mainLine = line.strip()
            if not mainLine:
                continue
            fields = mainLine.split("\t")
            start = int(fields[1])
            stop = int(fields[2])
            # Midpoint of the window; the parentheses matter, otherwise only
            # ``start`` is halved and the position lands outside the window.
            position = start + (stop - start) // 2
            hashCov.setdefault(fields[0], []).append((position, int(fields[3])))
    # Leaving the ``with`` block waits for bedtools, so the status is known here.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)
    return hashCov

##################################################
## Main code
##################################################
if __name__ == "__main__":
    # Only the script entry point needs tempfile, so it is imported locally.
    import tempfile

    # Parameters recovery.
    # NOTE: ``parser`` has to remain a module-level name because
    # checkParameters() reads it as a global; do not move this into a function.
    parser = argparse.ArgumentParser(prog='controllingMapping.py', description='''This Program identifies the misassemblies in contigs using BAM, and uses BedTools2''')
    parser.add_argument('-v', '--version', action='version', version='You are using %(prog)s version: ' + version,
                        help='display controllingMapping.py version number and exit')
    filesreq = parser.add_argument_group('Input mandatory infos for running')
    filesreq.add_argument('-i', '--input', metavar="<filename>", required=True, dest='inputFile',
                          help='BAM file issued from mapping (must be indexed)')
    filesreq.add_argument('-o', '--out', metavar="<filename>", required=True, dest='outputPrefix',
                          help='Prefix for the output files')

    parser.add_argument('-s', '--size', metavar="<sizeInBP>", required=False, default=1000, type=int,
                        dest='windowSize', help='Window size for scanning')
    parser.add_argument('-t', '--temp', metavar="<diskLocation>", required=False, default="/tmp",
                        dest='tempLocation', help='Location of temp directory')

    # Check parameters
    args = parser.parse_args()
    checkParameters(args)

    # Window size for scanning; range() below cannot step by zero and a
    # negative step would silently produce no window at all.
    windowSize = args.windowSize
    if windowSize < 1:
        parser.error("window size (-s/--size) must be a positive integer")
    # Temp location
    tempLocation = args.tempLocation

    # Welcome message
    print("#################################################################")
    print("# Welcome in controllingMapping (Version " + version + ") #")
    print("#################################################################")

    # From relative to absolute paths
    inputFile = relativeToAbsolutePath(args.inputFile)
    outputPrefix = relativeToAbsolutePath(args.outputPrefix)

    # The position file is created empty, as before: nothing in this script
    # writes to it yet, so the handle is closed right away instead of leaking.
    outputFile = outputPrefix + "_position.csv"
    open(outputFile, "w").close()

    # Picking up the name and length of every reference sequence from the BAM header
    bamFile = pysam.AlignmentFile(inputFile, "rb")
    try:
        seqInfo = dict(zip(bamFile.references, bamFile.lengths))
    finally:
        bamFile.close()

    # A uniquely named temp BED file avoids clashes between concurrent runs
    # sharing the same temp directory.
    tempHandle = tempfile.NamedTemporaryFile(mode="w", prefix="controllingMapping_", suffix=".bed",
                                             dir=tempLocation, delete=False)
    tempBed = tempHandle.name
    try:
        with tempHandle:
            for seq, seqLength in seqInfo.items():
                # NOTE(review): BED intervals are 0-based and half-open, while
                # these windows start at 1 - confirm that skipping the first
                # base of each sequence is intended.
                for window in range(1, seqLength, windowSize):
                    stop = min(window + windowSize, seqLength)
                    tempHandle.write(seq + "\t" + str(window) + "\t" + str(stop) + "\n")

        # The BED file is closed (hence fully flushed to disk) before bedtools
        # reads it; otherwise the last buffered windows could be missing.
        depthTotal = multicov(tempBed, inputFile)
        depthOk = multicov(tempBed, inputFile, "-p")
    finally:
        os.remove(tempBed)

    # One depth plot per contig: total depth plus the depth obtained with -p
    for contigs, totalPoints in depthTotal.items():
        depthInfo = dict(totalPoints)
        okInfo = dict(depthOk[contigs])
        globalInfo = {position: [depth, okInfo[position]] for position, depth in depthInfo.items()}
        df = pd.DataFrame.from_dict(globalInfo, orient='index', columns=['Depth', 'Ok'])
        df = df.sort_index()
        # Contigs without any mapped read are not plotted
        if df['Depth'].sum() <= 0:
            continue
        plt.figure()
        ax = plt.gca()
        df.plot(kind='line', y='Depth', ax=ax, title=contigs)
        df.plot(kind='line', y='Ok', color='red', ax=ax)
        # The figure is written in the current working directory
        filename = contigs + ".png"
        plt.savefig(filename)
        # Release the figure: one is opened per contig and assemblies can
        # contain thousands of contigs.
        plt.close()

    sys.exit()
46 changes: 46 additions & 0 deletions AdditionalScripts/makerange.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import sys
import argparse
from Bio.SeqIO.FastaIO import SimpleFastaParser

"""
split a genome into a set of overlapping segments
adapted from nanopolish_makerange.py https://github.com/jts/nanopolish to CulebrONT
"""
# DRAFT = snakemake.input.draft
# SEGMENT_LENGTH = snakemake.params.segment_len if not snakemake.params.segment_len == '' else 50000
# OVERLAP_LENGTH = snakemake.params.overlap_len if not snakemake.params.overlap_len == '' else 200
# MIN_SEGMENT_LENGTH = 5 * OVERLAP_LENGTH
# OUTPUTFILE = snakemake.output.segments_list

def segment_ranges(name, length, segment_length=50000, overlap_length=200):
    """Yield the ``name:start-end`` segments covering one sequence.

    Segments start every ``segment_length`` bases and extend
    ``overlap_length`` bases past their nominal end. When fewer than
    ``5 * overlap_length`` bases would remain after a segment, that segment
    is extended to the last base of the sequence instead.

    Args:
        name: Sequence identifier used as prefix of every segment.
        length: Sequence length in bases.
        segment_length: Distance between two consecutive segment starts.
        overlap_length: Number of bases a segment runs into the next one.

    Yields:
        Strings formatted as ``name:start-end``.

    Raises:
        ValueError: If ``segment_length`` is not positive, since the scan
            would never move forward.
    """
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive integer")
    min_segment_length = 5 * overlap_length
    start = 0
    while start < length:
        end = start + segment_length
        # If this segment will end near the end of the contig, extend it to end
        if length - end < min_segment_length:
            yield f"{name}:{start}-{length - 1}"
            start = length
        else:
            yield f"{name}:{start}-{end + overlap_length}"
            start = end


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Partition a genome into a set of overlapping segments')
    # Both paths are mandatory: without them open() would fail with a TypeError.
    parser.add_argument('--draft', required=True, help="draft to segment")
    parser.add_argument('--segment-length', type=int, default=50000)
    parser.add_argument('--overlap-length', type=int, default=200)
    parser.add_argument('--output-file', required=True, help="output file")
    args = parser.parse_args()

    DRAFT = args.draft
    SEGMENT_LENGTH = args.segment_length
    OVERLAP_LENGTH = args.overlap_length
    OUTPUT_FILE = args.output_file
    MIN_SEGMENT_LENGTH = 5 * OVERLAP_LENGTH

    if SEGMENT_LENGTH <= 0:
        parser.error("--segment-length must be a positive integer")

    # Only the identifier (first word of the title) and the length are kept
    with open(DRAFT, "r") as draft:
        recs = [(title.split(None, 1)[0], len(seq))
                for title, seq in SimpleFastaParser(draft)]

    with open(OUTPUT_FILE, "w") as output:
        for name, length in recs:
            for segment in segment_ranges(name, length, SEGMENT_LENGTH, OVERLAP_LENGTH):
                output.write(segment + "\n")
129 changes: 129 additions & 0 deletions AdditionalScripts/rotateCircSeqs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@

## Read Fasta file of a circular sequence, rotate it,
## and save the resulting sequences in a new fasta file.

#### Loading required packages #####
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library(Biostrings))

#### COMMAND LINE ARGUMENTS PARSING ######
# Command-line interface: one optparse option per input.
# -f: fasta file holding the sequences to rotate
seqFileOption <- make_option(
  opt_str = c("-f", "--seqFile"),
  type = "character",
  default = NULL,
  help = "Path of fasta file of seqences to be rotated."
)
# -o: destination fasta file
outFilePathOption <- make_option(
  opt_str = c("-o", "--outFilePath"),
  type = "character",
  default = NULL,
  help = "Path where output sequence file will be written."
)
# -d: optional circlator merge log telling which sequences were circularised
circlatorLogOption <- make_option(
  opt_str = c("-d", "--circlatorLog"),
  type = "character",
  default = NULL,
  help = "Path to a circlator '04.merge.circularise.log' file to be use to rotate only circularized sequences."
)
option_list <- list(seqFileOption, outFilePathOption, circlatorLogOption)

# Build the parser first, then read the actual command line into myArgs
optionParser <- OptionParser(
  usage = "%prog [options]",
  option_list = option_list,
  description = "Rotate sequences in an input fasta file. Sequences will be rotated if the sequence title contains a 'circular' flag, they will be rotated and their title will be appended with a 'rotated' suffix. If a circlator.log file is provided and no info is available in the titles, the sequences described as circular in the log file will be rotated and their title will be appended with a 'circular|rotated' suffix. Sequences that do not meet these criteria are left untouched."
)
myArgs <- parse_args(optionParser)

###########################################
##### FOR INTERACTIVE ARGUMENTS PASSING TESTS
# testArgs <- c(
# "--seqFile=/home/cunnac/Lab-Related/MyScripts/pacbacpipe/tests/tagCirSeq/talCmut_canu/06.fixstartcircFlagged.fasta",
# "--outFilePath=/home/cunnac/rotateSeqTest.fasta",
# "--circlatorLog=/home/cunnac/Lab-Related/MyScripts/pacbacpipe/tests/tagCirSeq/talCmut_canu/04.merge.circularise.log"
# )
# myArgs <- parse_args(OptionParser(usage = "%prog", option_list = option_list), args = testArgs)
###########################################

# Assign parameter values
seqFile <- myArgs$seqFile
outSeqFile <- myArgs$outFilePath
if(is.null(outSeqFile)) outSeqFile <- paste0(gsub("(^.*)\\..*$", "\\1", seqFile), "_rotated.fasta")
circlatorLog <- myArgs$circlatorLog
circLogProvided <- ifelse(is.null(circlatorLog) || circlatorLog == "", FALSE, TRUE)

# This is the number by which seq lenght is devided to determine length of offset
n <- 5

# Loading sequences and determining if any is flagged as circular
seq <- readDNAStringSet(seqFile) #seq <- DNAStringSet(c("AAATTTGGGCCCNNN", "AAAATTTTCCCC"))
circularInName <- grepl("circular", names(seq), ignore.case = TRUE)

# When both a circlator log and 'circular' flags in the titles are available,
# check that every sequence of the fasta file is known to the log. Names are
# compared on their first token (text before the first '_' or space).
if (circLogProvided && any(circularInName)) {
  circlatorDf <- read.delim(circlatorLog, stringsAsFactors = FALSE)
  firstToken <- function(x) strsplit(x, split = "[_ ]")[[1]][1]
  # as.character() keeps strsplit() working when contig names are purely
  # numeric and were therefore read as numbers
  logNames <- as.character(circlatorDf$X.Contig)
  seqNamePrefixes <- vapply(names(seq), firstToken, character(1))
  circNamePrefixes <- vapply(logNames, firstToken, character(1))
  if (length(setdiff(seqNamePrefixes, circNamePrefixes)) != 0L) {
    # Collapse the names explicitly, otherwise stop() glues them together
    # without any separator and the message is unreadable
    stop("Sequence names in sequence file and circlatorLog do not match:\n",
         "Names in fasta file: ", paste(names(seq), collapse = ", "), "\n",
         "Names in circlator log: ", paste(logNames, collapse = ", "), "\n")
  } else {
    cat("## Provided a circlator log file and sequences in fasta file are also otherwise flagged as circular...\n",
        "## Circular flags in fasta sequence titles will prevail to select sequences that are rotated.", sep = "")
  }
}

# Deciding on what will be rotated
if (any(circularInName)) { # if some seqs have 'circular' in their name, flagged seq prevails regardless of log
circSeq <- seq[circularInName]
linearSeq <- seq[!circularInName]
} else if (circLogProvided && !any(circularInName)) { # if log and no flagged seq, log prevails
circlatorDf <- read.delim(circlatorLog, stringsAsFactors = FALSE)
circSeq <- seq[circlatorDf[circlatorDf$circularised == 1, "X.Contig"]]
linearSeq <- seq[circlatorDf[circlatorDf$circularised == 0, "X.Contig"]]
} else { # No seqs will be rotated!!!
warning("No 'circular' hit in sequence names and no circlatorLog file provided either, NONE of the sequences will be rotated!!!!.")
circSeq <- DNAStringSet()
linearSeq <- seq
}

# Rotating circular sequences
newCircSeq <- DNAStringSet(
sapply(X = circSeq, FUN = function(x) {
seqLength <- nchar(x)
newStartPos <- round(seqLength/n)
c(subseq(x, start = newStartPos, end = seqLength), subseq(x, end = newStartPos-1, width = newStartPos-1))
})
)
# Modify names of rotated circular sequences if necessary
if (length(circSeq) != 0) {
names(newCircSeq) <- if (any(circularInName)) {
paste(names(newCircSeq), "rotated", sep = "_")
} else if (circLogProvided && !any(circularInName)) {
paste(names(newCircSeq), "circular_rotated", sep = "_")
}
}
# Building output seq set
# Building output seq set: untouched sequences first, rotated ones last
newSeq <- c(linearSeq, newCircSeq)

# Sanity check: the output must hold exactly the same collection of sequence
# lengths as the input. Sorted lengths are compared (not the set of distinct
# lengths) so that a lost or duplicated sequence is detected even when
# another sequence has the same length.
if(!identical(sort(as.integer(nchar(seq))), sort(as.integer(nchar(newSeq))))) stop("Rotated sequence has a different length than the original one!")
# Save output
writeXStringSet(x = newSeq, filepath = outSeqFile, compress = FALSE)

# Log messages
summaryDf <- data.frame(
seqNames = names(newSeq),
seqLength = nchar(newSeq),
StartAtFormerPosition = round(nchar(newSeq)/n)
)
cat("\n")
cat("***********************************************************************************************\n")
if (!circLogProvided & !any(circularInName)) {
cat("##", date(), ": No 'circular' hit in sequence names and no circlatorLog file provided either, NOTHING has been rotated!!!!")
print(knitr::kable(summaryDf[, c("seqNames", "seqLength")]))
} else if (length(circSeq) != 0) {
cat("##", date(), ": Summary of circular sequences rotation in '", seqFile, "' :")
print(knitr::kable(summaryDf))
} else {
cat("##", date(), ": No 'circular' hit in sequence names and none of the sequences in '", seqFile, "' were specified as circular in circlatorLog file and nothing has been rotated!")
print(knitr::kable(summaryDf[, c("seqNames", "seqLength")]))
}
cat("\n")
cat("All sequences saved in:", outSeqFile, "\n")
cat("***********************************************************************************************\n")

quit(save = "no")
Loading

0 comments on commit 6e072d8

Please sign in to comment.