diff --git a/docs/Manifest.toml b/docs/Manifest.toml index f580a1b..7dc19e0 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,6 +1,6 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.11.1" +julia_version = "1.11.2" manifest_format = "2.0" project_hash = "a737ae23847074aa3f8c65ffa51f2732c1aeb154" @@ -107,9 +107,9 @@ version = "0.9.3" [[deps.Documenter]] deps = ["ANSIColoredPrinters", "AbstractTrees", "Base64", "CodecZlib", "Dates", "DocStringExtensions", "Downloads", "Git", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "MarkdownAST", "Pkg", "PrecompileTools", "REPL", "RegistryInstances", "SHA", "TOML", "Test", "Unicode"] -git-tree-sha1 = "5a1ee886566f2fa9318df1273d8b778b9d42712d" +git-tree-sha1 = "d0ea2c044963ed6f37703cead7e29f70cba13d7e" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "1.7.0" +version = "1.8.0" [[deps.DocumenterVitepress]] deps = ["ANSIColoredPrinters", "Base64", "DocStringExtensions", "Documenter", "IOCapture", "Markdown", "NodeJS_20_jll", "REPL"] @@ -130,15 +130,15 @@ version = "1.6.0" [[deps.ExceptionUnwrapping]] deps = ["Test"] -git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a" +git-tree-sha1 = "d36f682e590a83d63d1c7dbd287573764682d12a" uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4" -version = "0.1.10" +version = "0.1.11" [[deps.Expat_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "1c6317308b9dc757616f0b5cb379db10494443a7" +git-tree-sha1 = "e51db81749b0777b2147fbe7b783ee79045b8e99" uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.6.2+0" +version = "2.6.4+1" [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" @@ -164,15 +164,15 @@ version = "1.3.1" [[deps.Git_jll]] deps = ["Artifacts", "Expat_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Libiconv_jll", "OpenSSL_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "ea372033d09e4552a04fd38361cd019f9003f4f4" +git-tree-sha1 = "399f4a308c804b446ae4c91eeafadb2fe2c54ff9" uuid = "f8c6e375-362e-5223-8a59-34ff63f689eb" -version = "2.46.2+0" +version = "2.47.1+0" [[deps.HTTP]] -deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] -git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d" +deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "PrecompileTools", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] +git-tree-sha1 = "6c22309e9a356ac1ebc5c8a217045f9bae6f8d9a" uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "1.10.9" +version = "1.10.13" [[deps.IOCapture]] deps = ["Logging", "Random"] @@ -208,9 +208,9 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.4" [[deps.LazilyInitializedFields]] -git-tree-sha1 = "8f7f3cabab0fd1800699663533b6d5cb3fc0e612" +git-tree-sha1 = "0f2da712350b020bc3957f269c9caad516383ee0" uuid = "0e77f7df-68c5-4e49-93ce-4cd80f5598bf" -version = "1.2.2" +version = "1.3.0" [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] @@ -321,9 +321,9 @@ uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" version = "3.0.15+1" [[deps.OrderedCollections]] -git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" +git-tree-sha1 = "12f1439c4f986bb868acda6ea33ebc78e19b95ad" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.3" +version = "1.7.0" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] diff --git a/docs/make.jl b/docs/make.jl index e5727db..aa1a6d7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,7 @@ pgs = [ "The ORF type" => "orftype.md", "Scoring ORFs" => "features.md", "A Simple Coding Rule" => "simplecodingrule.md", + "Ribosome Binding Sites" => "rbs.md", "Writing ORFs In Files" => "iodocs.md", ], "API" => "api.md", diff --git a/docs/src/rbs.md b/docs/src/rbs.md new file mode 100644 index 0000000..0aa3b9f --- /dev/null +++ b/docs/src/rbs.md @@ -0,0 +1,82 @@ +## Ribosome Binding Site (RBS) Motifs + +This document provides information on Ribosome Binding Site (RBS) motifs based on the Prodigal paper. The RBS motifs are categorized by spacer ranges, which indicate the number of base pairs (bp) between the RBS and the start codon. + +### Spacer Ranges and Representative RBS Motifs + +Example sequence with RBS motif: + + Spacer Range | Representative RBS Motif(s) + ------------ | --------------------------- + 3-4 bp | GGA, GAG, AGG, AGxAG, AGGAG + 5-10 bp | GGAG, GAGG, GGxGG, AGGAGG + 11-12 bp | GGA, GAG, AGG, AGxAG, AGGAG + 13-15 bp | GGA, GAG, AGG, AGGA, AGGAG + +### Window Checking Ranges + +Three different window checking ranges are defined to identify potential RBS motifs relative to the start codon (ATG): + +- Window Checking A: +``` + -10 -3 |-> start codon + ...|......|..ATG... +``` + +- Window Checking B: +``` + -16 -5 |-> start codon + ...|..........|....ATG... +``` + +- Window Checking C: +``` + -20 -11 |-> start codon + .|........|..........ATG... +``` + +### RBS Motif Scoring + +The RBS motif scoring is based on the Prodigal paper, which uses a scoring system to identify potential RBS motifs. The scoring system is as follows: + + + +### Example + +An example of an exact search query for an RBS motif is provided: + +- **ExactSearchQuery(dna"ATACG", iscompatible)** returns 5, indicating the position of the motif. + +``` + -20 -15 -10 -5 |-> start codon + |....|....|....|....ATG... + GGAGGACCCCATGACACACACAACAC + |----|:RBS(dna"GGAGGA", 1:5, :A, 27, STRAND_POS) +``` + +To inspect the RBS motifs of a given ORF, we can use the `_findrbs` function. This function returns the RBS motifs of a given ORF: + +```julia + +phi = dna"GTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTATATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTGCTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTTCTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTTATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGCCCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAACCTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGT ACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGA" + +orf = findorfs(phi)[3] + +GeneFinder._findrbs(orf) + +6-element Vector{RBS}: + RBS(GAG, 90:92, :A, 13, STRAND_POS) + RBS(GGAG, 89:92, :B, 16, STRAND_POS) + RBS(AGG, 88:90, :B, 13, STRAND_POS) + RBS(GGA, 89:91, :B, 13, STRAND_POS) + RBS(GAG, 90:92, :B, 13, STRAND_POS) + RBS(AGGA, 88:91, :B, 16, STRAND_POS) + +``` + +Now, following the scoring scheme mentioned above, we can calculate the RBS score for each motif. This criteria is implemented in the `orf_rbs_score` or simply aliased to `orbs` function: + +```julia +orf_rbs_score(orf) +29 +``` \ No newline at end of file diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 5c670ef..909d8af 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -21,7 +21,10 @@ using BioSequences: reverse_complement, randdnaseq, ncbi_trans_table, - translate + translate, + + ExactSearchQuery, + iscompatible using BioMarkovChains: BioMarkovChain, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using IterTools: takewhile, iterated @@ -31,19 +34,21 @@ using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, Strand, summary include("algorithms/naivefinder.jl") include("algorithms/naivecollector.jl") -# Coding Criteria -include("criteria/lordr.jl") - # Main functions include("types.jl") include("findorfs.jl") -include("iscoding.jl") include("io.jl") # Utils and extended functions include("utils.jl") include("extended.jl") +# RBS Scoring +include("rbs.jl") + +# Coding Criteria +include("criteria.jl") + # Precompiled workloads include("workload.jl") diff --git a/src/criteria/lordr.jl b/src/criteria.jl similarity index 50% rename from src/criteria/lordr.jl rename to src/criteria.jl index d403c7b..8bd5f26 100644 --- a/src/criteria/lordr.jl +++ b/src/criteria.jl @@ -1,4 +1,38 @@ -export log_odds_ratio_decision_rule, lordr #lors +export iscoding, + log_odds_ratio_decision_rule, lordr, + ribsome_binding_site_decision_rule, rbsdr + + + @doc raw""" + iscoding(orf::ORFI{N,F}; criteria::Function = lordr, kwargs...) -> Bool + +Check if the given DNA sequence of an ORF is likely to be coding based on a scoring scheme/function. + +## Scoring Criteria/Functions +- `lordr`: Log-Odds Ratio Decision Rule +- `rbsdr`: Ribosome Binding Site Decision Rule + +``` +phi = dna"GTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAAT...AGTGTTTCCAGTCCGTTCAGTTAATAGTCAGGTTAAAGATAAAAGATTGA" +orfs = findorfs(phi) + +orf = orfs[2] + ORFI{NaiveFinder}(94:126, '-', 1) + +iscoding(orf) # Returns: true +``` +""" +function iscoding( + orf::ORFI{N,F}; + criteria::Function = lordr, #rbsdr + kwargs... +) where {N,F<:GeneFinderMethod} + return criteria(orf; kwargs...) +end + +### Actual criteria functions ### + +## Log-Odds Ratio Decision Rule @doc raw""" log_odds_ratio_decision_rule( @@ -41,15 +75,15 @@ iscoding(sequence) # Returns: true or false ``` """ function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr - sequence::NucleicSeqOrView{DNAAlphabet{N}}; + orf::ORFI{N,F}; modela::BioMarkovChain = ECOLICDS, modelb::BioMarkovChain = ECOLINOCDS, b::Number = 2, η::Float64 = 5e-3 -) where {N} - - scorea = log_odds_ratio_score(sequence; modela=modela, b=b) - scoreb = log_odds_ratio_score(sequence; modela=modelb, b=b) +) where {N,F<:GeneFinderMethod} + orfseq = sequence(orf) + scorea = log_odds_ratio_score(orfseq; modela=modela, b=b) + scoreb = log_odds_ratio_score(orfseq; modela=modelb, b=b) logodds = scorea / scoreb @@ -60,4 +94,33 @@ function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr end end -const log_odds_ratio_decision_rule = lordr # criteria \ No newline at end of file +const log_odds_ratio_decision_rule = lordr # criteria + +## Ribosome Binding Site Decision Rule + +""" + ribsome_binding_site_decision_rule(orf::ORFI{N,F}) where {N,F<:GeneFinderMethod} -> Bool + +Evaluates if an Open Reading Frame (ORF) has a significant ribosome binding site (RBS). + +The function uses the `orf_rbs_score` to calculate a score for the ORF's RBS region and returns +true if the score exceeds a threshold of 9, indicating the presence of at least one RBS. + +# Arguments +- `orf::ORFI{N,F}`: An Open Reading Frame Interface (ORFI) object parameterized by N and F, + where F is a subtype of GeneFinderMethod + +# Returns +- `Bool`: `true` if the RBS score is greater than 9, `false` otherwise + +""" +function rbsdr(orf::ORFI{N,F}) where {N,F<:GeneFinderMethod} + scr = orf_rbs_score(orf) + if scr > 9 # at least one RBS... + return true + else + return false + end +end + +const ribsome_binding_site_decision_rule = rbsdr \ No newline at end of file diff --git a/src/extended.jl b/src/extended.jl index 625cadd..36fd21e 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -14,9 +14,9 @@ import BioSequences: translate ## Methods from BioMarkovChains that expand their fuctions to this package structs import BioMarkovChains: log_odds_ratio_score -export log_odds_ratio_score, lors +# export log_odds_ratio_score, lors @inline log_odds_ratio_score(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_score(sequence(orf); kwargs...) -@inline log_odds_ratio_decision_rule(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_decision_rule(sequence(orf); kwargs) +# @inline log_odds_ratio_decision_rule(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_decision_rule(sequence(orf); kwargs) -const lors = log_odds_ratio_score +# const lors = log_odds_ratio_score diff --git a/src/iscoding.jl b/src/iscoding.jl deleted file mode 100644 index 803099c..0000000 --- a/src/iscoding.jl +++ /dev/null @@ -1,31 +0,0 @@ -export iscoding - -@doc raw""" - iscoding(sequence::LongSequence{DNAAlphabet{4}}; criteria::Function = lordr, kwargs...) -> Bool - -Check if a given DNA sequence is likely to be coding based on a scoring scheme. - -``` -sequence = dna"ATGGCATCTAG" -iscoding(sequence) # Returns: true or false -``` -""" -function iscoding end - -function iscoding( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - criteria::Function = lordr, - kwargs... -) where {N} - return criteria(sequence; kwargs...) -end - -function iscoding( - orf::ORFI{F}; - criteria::Function = lordr, - kwargs... -) where {F<:GeneFinderMethod} - return criteria(sequence(orf); kwargs...) -end - -# iscoding.(seq[i for i in allorfs]) \ No newline at end of file diff --git a/src/rbs.jl b/src/rbs.jl new file mode 100644 index 0000000..c68c6be --- /dev/null +++ b/src/rbs.jl @@ -0,0 +1,300 @@ +export RBS, orf_rbs_score, orbs, motifseq, offset +public _rbswindows, _findrbs + +""" + RBS(motif::LongSubSeq, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) + +Represents a Ribosome Binding Site (RBS) in a DNA sequence. + +# Fields +- `motif::LongSubSeq`: The nucleotide sequence of the RBS motif +- `offset::UnitRange{Int64}`: The position range of the RBS in the sequence +- `window::Symbol`: Symbol indicating the window type or region where the RBS was found +- `score::Int64`: Numerical score indicating the strength or confidence of the RBS prediction +- `strand::Strand`: Indicates whether the RBS is on the forward or reverse strand + +# Constructor + RBS(motif::LongSubSeq{A}, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) where {A<:NucleicAcidAlphabet} + +Creates a new RBS instance with the specified parameters. +""" +struct RBS + motif::LongSubSeq # motif + offset::UnitRange{Int64} # offset + window::Symbol + score::Int64 + strand::Strand + + function RBS(motif::LongSubSeq{A}, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) where {A<:NucleicAcidAlphabet} + return new(motif, offset, windowsymbol, score, strand) + end + # rbsinst = RBS(biore"RRR"dna, 3:4, 1.0) +end + +## based on https://github.com/deprekate/PHANOTATE/blob/8b4e728171adc7d900ccbc59f9606bfd585e138c/phanotate_modules/functions.py#L48 +const FORWARDRBSMOTIFS = Dict( + ExactSearchQuery(dna"GGAGGA", iscompatible) => 27, + ExactSearchQuery(dna"GGAGG", iscompatible) => 24, + ExactSearchQuery(dna"GAGGA", iscompatible) => 22, + ExactSearchQuery(dna"GGACGA", iscompatible) => 19, + ExactSearchQuery(dna"GGATGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAAGA", iscompatible) => 19, + ExactSearchQuery(dna"GGCGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGGGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGTGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAG", iscompatible) => 16, + ExactSearchQuery(dna"GAGG", iscompatible) => 16, + ExactSearchQuery(dna"AGGA", iscompatible) => 16, + ExactSearchQuery(dna"GGTGG", iscompatible) => 14, + ExactSearchQuery(dna"GGGGG", iscompatible) => 14, + ExactSearchQuery(dna"GGCGA", iscompatible) => 14, + ExactSearchQuery(dna"AGG", iscompatible) => 13, + ExactSearchQuery(dna"GAG", iscompatible) => 13, + ExactSearchQuery(dna"GGA", iscompatible) => 13, + ExactSearchQuery(dna"GAAGA", iscompatible) => 9, + ExactSearchQuery(dna"GATGA", iscompatible) => 9, + ExactSearchQuery(dna"GACGA", iscompatible) => 9, +) + +const REVERSERBSMOTIFS = Dict( + ExactSearchQuery(dna"TCCTCC", iscompatible) => 27, + ExactSearchQuery(dna"CCTCC", iscompatible) => 24, + ExactSearchQuery(dna"TCCTC", iscompatible) => 22, + ExactSearchQuery(dna"TCGTCC", iscompatible) => 19, + ExactSearchQuery(dna"TCATCC", iscompatible) => 19, + ExactSearchQuery(dna"TTCCTC", iscompatible) => 19, + ExactSearchQuery(dna"TCCGCC", iscompatible) => 19, + ExactSearchQuery(dna"TCCCCT", iscompatible) => 19, + ExactSearchQuery(dna"AGGTCC", iscompatible) => 19, + ExactSearchQuery(dna"CTCC", iscompatible) => 16, + ExactSearchQuery(dna"CCTC", iscompatible) => 16, + ExactSearchQuery(dna"TCCT", iscompatible) => 16, + ExactSearchQuery(dna"GGTCC", iscompatible) => 14, + ExactSearchQuery(dna"CCCCC", iscompatible) => 14, + ExactSearchQuery(dna"AGCCG", iscompatible) => 14, + ExactSearchQuery(dna"CCT", iscompatible) => 13, + ExactSearchQuery(dna"CTC", iscompatible) => 13, + ExactSearchQuery(dna"TCC", iscompatible) => 13, + ExactSearchQuery(dna"TCTTC", iscompatible) => 9, + ExactSearchQuery(dna"TCATC", iscompatible) => 9, + ExactSearchQuery(dna"TCGTC", iscompatible) => 9, +) + +## based on the Prodigal paper: +# Spacer Range | Representative RBS Motif(s) +# ------------ | --------------------------- +# 3-4 bp | GGA, GAG, AGG, AGxAG, AGGAG +# 5-10 bp | GGAG, GAGG, GGxGG, AGGAGG +# 11-12 bp | GGA, GAG, AGG, AGxAG, AGGAG +# 13-15 bp | GGA, GAG, AGG, AGGA, AGGAG + +## Window checking a: +# -10 -3 |-> start codon +# ...|......|..ATG... + +## Window checking b: +# -16 -5 |-> start codon +# ...|..........|....ATG... + +## Window checking c: +# -20 -11 |-> start codon +# .|........|..........ATG... + +## Example: +# ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif + +# -20 -15 -10 -5 |-> start codon +# |....|....|....|....ATG... +#EX1. GGAGGACCCCATGACACACACAACAC +# |----|:RBS(dna"GGAGGA", 1:5, :A, 27, STRAND_POS) + +""" + _rbswindows(orf::ORFI{N,F}) where {N,F} -> Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}} + +Generate potential ribosome binding site (RBS) windows for a given open reading frame (ORF). + +Returns a tuple of UnitRanges representing three possible RBS windows relative to the ORF's start/end position, +filtering out any windows with invalid positions (< 1). + +For positive strand ORFs: +- Window A: -10 to -3 positions upstream of start +- Window B: -16 to -5 positions upstream of start +- Window C: -20 to -11 positions upstream of start + +For negative strand ORFs: +- Window A: +3 to +10 positions downstream of end +- Window B: +5 to +16 positions downstream of end +- Window C: +11 to +20 positions downstream of end + +# Arguments +- `orf::ORFI{N,F}`: An ORF instance containing strand and position information + +# Returns +- Tuple of valid UnitRanges representing RBS windows +""" +function _rbswindows(orf::ORFI{N,F}) where {N,F} + windowa = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) + windowb = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) + windowc = orf.strand == STRAND_POS ? (orf.first-20:orf.first-11) : (orf.last+11:orf.last+20) + windows = (windowa, windowb, windowc) + return filter(window -> first(window) >= 1 && last(window) >= 1, windows) +end + +""" + _findrbs(orf::ORFI{N,F}) where {N,F} -> Vector{RBS} + +Search for Ribosome Binding Sites (RBS) in the given Open Reading Frame (ORF). + +This function analyzes potential RBS locations in three windows upstream of the ORF start codon. +For each window, it searches for predefined RBS motifs (either forward or reverse depending on +the strand orientation) and records their locations and scores. + +# Arguments +- `orf::ORFI{N,F}`: An Open Reading Frame Interface object containing sequence and strand information + +# Returns +- `Vector{RBS}`: A vector of RBS objects, each containing: + - The RBS sequence + - The position range in the source sequence + - A window symbol (`:A`, `:B`, or `:C`) + - A score value + - The strand orientation + +# Implementation Details +- Searches in three windows upstream of the ORF +- Uses different motif sets for positive and negative strands +- Identifies all occurrences of RBS motifs in each window +- Records both the sequence and its position information + +# Note +This is an internal and public function as indicated by the leading underscore. +""" +function _findrbs(orf::ORFI{N,F}) where {N,F} + # s = reverse(seq) + # score = 0 + wsymb = (:A, :B, :C) + rbsvect = Vector{RBS}() + windows = _rbswindows(orf) + + # Iterate over the windows and their corresponding symbols + for i in 1:3#length(windows) + window = windows[i] + symbol = wsymb[i] + wsqv = view(source(orf), window) + ## here we can apply a logic to decid whether to use the reverse motif or the forward motif + motifs = orf.strand == STRAND_POS ? FORWARDRBSMOTIFS : REVERSERBSMOTIFS + for (rbs, scr) in pairs(motifs) + motifranges::Vector{UnitRange{Int}} = findall(rbs, wsqv) + if !isempty(motifranges) + for motifrange in motifranges + offset = (first(window) + first(motifrange) - 1):(first(window) + last(motifrange) - 1) + # rbseq = orf.strand == STRAND_POS ? view(source(orf), offset) : reverse_complement(view(source(orf), offset)) + rbseq = view(wsqv,motifrange) + push!(rbsvect, RBS(rbseq, offset, symbol, scr, orf.strand)) + end + end + end + end + return rbsvect +end + +""" + orf_rbs_score(orf::ORFI{N,F}) where {N,F} -> Int64 + +Calculate a ribosome binding site (RBS) score for a given open reading frame (ORF). + +This function evaluates potential ribosome binding sites in three windows upstream of +the ORF start codon by searching for known RBS motifs. It returns a total score based +on the strongest motif found in each window. + +# Arguments +- `orf::ORFI{N,F}`: An open reading frame interface object containing sequence and position information + +# Returns +- `Float64`: The sum of the maximum RBS motif scores found in each window + +# Details +The function: +1. Divides the upstream region into three windows (a, b, c) +2. Searches each window for matching RBS motifs +3. Keeps track of the highest scoring motif in each window +4. Returns the sum of the maximum scores across all windows + +RBS motifs and their scores are predefined in `FORWARDRBSMOTIFS` for positive strand +and `REVERSERBSMOTIFS` for negative strand sequences. +""" +function orbs(orf::ORFI{N,F}) where {N,F} + # Initialize the score and the max scores dictionary + wsymb = (:a, :b, :c) + windows = _rbswindows(orf) + maxscores = Dict(:a => 0, :b => 0, :c => 0) + + # Iterate over the windows and their corresponding symbols + @inbounds for i in 1:3 + window = windows[i] + symbol = wsymb[i] + wsqv = view(source(orf), window) + + motifs = orf.strand == STRAND_POS ? FORWARDRBSMOTIFS : REVERSERBSMOTIFS + # Check for RBS motifs in the sequence view + @inbounds for (rbs, scr) in pairs(motifs) + if occursin(rbs, wsqv) + # Update the max score directly + if scr > maxscores[symbol] + maxscores[symbol] = scr + end + end + end + end + + # Sum the maximum scores of each window + totscore = sum(values(maxscores)) + + return totscore +end + +const orf_rbs_score = orbs + +export motifseq +function motifseq(rbs::RBS) + return rbs.motif +end + +export offset +function offset(rbs::RBS) + return rbs.offset +end + +# filter(x -> orf_rbs_score(x) > 9 && length(x) > 100, findorfs(phi)) + +# An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") +# and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) +# Use the memory layout of this findings to store also the frame, strand and view of the sequence. Figuring out the +# how the strand and the location will be defined in the memory layout is the next step. + +## Another idea would be to fastly count the number of ATG in a sequence twice. This will be the approx number of ORFs +## in the sequence.Them we can use other way to store the ORFs in the memory layout. + +## Another idea would be to use the findall function to find all the ATG in the sequence and then use the memory layout + +# export findrbs +# function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# # s = reverse(seq) +# score = 0 +# wsymb = (:a, :b, :c) +# rbsvect = Vector{RBS}() +# windows = _rbswindows(orf) + +# for i in 1:length(windows) +# window = windows[i] +# symbol = wsymb[i] +# sqv = @view seq[window] +# for (rbs, scr) in pairs(RBSMOTIFS) +# if occursin(rbs, sqv) +# push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) +# # score += scr # Use the value associated with the key +# end +# end +# end +# return rbsvect +# end \ No newline at end of file diff --git a/src/rbs2.jl b/src/rbs2.jl new file mode 100644 index 0000000..d90b929 --- /dev/null +++ b/src/rbs2.jl @@ -0,0 +1,88 @@ +### Another implementation + +# export RBSMOTIFS, RBSQUERYMOTIGS +# RBSMOTIFS = ( +# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", +# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", +# dna"GAAGA", dna"GATGA", dna"GACGA" +# ) + +# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) +# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + + +export RBSMOTIFS2 +RBSMOTIFS2 = ( + RBS(dna"GGAGGA", 5:10, 27, :A), + RBS(dna"GGAGGA", 3:8, 26, :B), + RBS(dna"GGAGGA", 11:16, 25, :C), + RBS(dna"GGAGG", 5:9, 24, :D), + RBS(dna"GGAGG", 3:8, 23, :E), + RBS(dna"GAGGA", 5:9, 22, :F), + RBS(dna"GAGGA", 3:8, 21, :G), + RBS(dna"GAGGA", 11:16, 20, :H), + RBS(dna"GGACGA", 5:10, 19, :I), + RBS(dna"GGATGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 5:10, 19, :I), + RBS(dna"GGCGGA", 5:10, 19, :I), + RBS(dna"GGGGGA", 5:10, 19, :I), + RBS(dna"GGTGGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 3:9, 18, :J), + RBS(dna"GGTGGA", 3:9, 18, :J), + RBS(dna"GGAAGA", 11:17, 17, :K), + RBS(dna"GGTGGA", 11:17, 17, :K), + RBS(dna"GGAG", 5:9, 16, :L), + RBS(dna"GAGG", 5:9, 16, :L), + RBS(dna"AGGA", 5:9, 15, :M), + RBS(dna"GGTGG", 5:9, 14, :N), + RBS(dna"GGGGG", 5:9, 14, :N), + RBS(dna"GGCNGG", 5:9, 14, :N), + RBS(dna"AGG", 5:8, 13, :O), + RBS(dna"GAG", 5:8, 13, :O), + RBS(dna"GGA", 5:8, 13, :O), + RBS(dna"AGGA", 11:15, 12, :P), + RBS(dna"AGGA", 3:7, 11, :Q), + RBS(dna"GAGGA", 13:19, 10, :R), + RBS(dna"GAAGA", 5:10, 9, :S), + RBS(dna"GATGA", 5:10, 9, :S), + RBS(dna"GACGA", 5:10, 9, :S), + RBS(dna"GGTGG", 3:8, 8, :T), + RBS(dna"GGTGG", 11:16, 7, :U), + RBS(dna"AGG", 11:14, 6, :V), + RBS(dna"GAAGA", 3:8, 5, :W), + RBS(dna"GAAGA", 11:16, 4, :X), + RBS(dna"AGGA", 13:17, 3, :Y), + RBS(dna"AGG", 13:16, 2, :Z), + RBS(dna"GGAAGA", 13:19, 2, :Z), + RBS(dna"GGTGG", 13:18, 2, :Z), + RBS(dna"AGG", 3:6, 1, :Z) +) + +export _motifscore +function _motifscore(seq::SeqOrView{A}) where {A} + score = 0 + for rbs in RBSMOTIFS2 + if occursin(rbs.motif, seq) + score += rbs.score + end + end + return score +end + +export RBSMOTIFS2QUERY +RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) + +export _findrbs2 +function _findrbs2(orf::ORFI{N,F}) where {N,F} + rbsvect = Vector{RBS}()#Vector{RBS}() + for i in 1:length(RBSMOTIFS2) + rbs = RBSMOTIFS2[i] + rbsq = RBSMOTIFS2QUERY[i] + match = findprev(rbsq, source(orf), i) #needs to deal with strand + if match != nothing + push!(rbsvect, RBS(rbs.motif, -(match, rbs.offset), rbs.score, rbs.window)) + println("Match found at: ", match) + end + end + return rbsvect +end