Merge pull request #57 from poseidon-framework/moreFor270

stschiff · web-flow · commit b92ebc6d994b · 2023-03-20T11:09:59.000+01:00
suggestions for additional changes in 2.7.0
diff --git a/README.md b/README.md
@@ -19,8 +19,9 @@ Every package should have the following files:
 
 It can also contain the following files:
 
-- A `README.txt` file for arbitrary context information
-- A `CHANGELOG.txt` file to document changes to the package
+- A `README.md` file for arbitrary context information
+- A `CHANGELOG.md` file to document changes to the package
+- A `.ssf` file with information on the underlying raw sequencing data
 
 Example:
 
@@ -30,9 +31,10 @@ Switzerland_LNBA_Roswita/Switzerland_LNBA.plink.bed
 Switzerland_LNBA_Roswita/Switzerland_LNBA.plink.bim
 Switzerland_LNBA_Roswita/Switzerland_LNBA.plink.fam
 Switzerland_LNBA_Roswita/Switzerland_LNBA.janno
+Switzerland_LNBA_Roswita/Switzerland_LNBA.ssf
 Switzerland_LNBA_Roswita/Switzerland_LNBA.bib
-Switzerland_LNBA_Roswita/README.txt
-Switzerland_LNBA_Roswita/CHANGELOG.txt
+Switzerland_LNBA_Roswita/README.md
+Switzerland_LNBA_Roswita/CHANGELOG.md
 ```
 
 ## The `POSEIDON.yml` file
@@ -47,7 +49,7 @@ Example:
 ```
 poseidonVersion: 2.5.0
 title: Switzerland_LNBA_Roswita
-description: LNBA Switzerland genetic data not yet published # optional
+description: LNBA Switzerland genetic data not yet published
 contributor:
   - name: Roswita Malone
     email: roswita.malone@example.org
@@ -58,18 +60,20 @@ lastModified: 2021-01-28
 genotypeData:	
   format: PLINK	
   genoFile: Switzerland_LNBA_Roswita.bed
-  genoFileChkSum: 95b093eefacc1d6499afcfe89b15d56c # optional
+  genoFileChkSum: 95b093eefacc1d6499afcfe89b15d56c
   snpFile: Switzerland_LNBA_Roswita.bim
-  snpFileChkSum: 6771d7c873219039ba3d5bdd96031ce3 # optional
+  snpFileChkSum: 6771d7c873219039ba3d5bdd96031ce3
   indFile: Switzerland_LNBA_Roswita.fam
-  indFileChkSum: f77dc756666dbfef3bb35191ae15a167 # optional
+  indFileChkSum: f77dc756666dbfef3bb35191ae15a167
   snpSet: 1240K
 jannoFile : Switzerland_LNBA_Roswita.janno
-jannoFileChkSum: 555d7733135ebcabd032d581381c5d6f # optional
-bibFile: sources.bib
-bibFileChkSum: 70cd3d5801cee8a93fc2eb40a99c63fa # optional
-readmeFile: README.txt # optional
-changelogFile: CHANGELOG.txt # optional
+jannoFileChkSum: 555d7733135ebcabd032d581381c5d6f
+sequencingSourceFile: Switzerland_LNBA_Roswita.ssf
+sequencingSourceFileChkSum: 19db1906240ee2f076e1a9659567dca4
+bibFile: Switzerland_LNBA_Roswita.bib
+bibFileChkSum: 70cd3d5801cee8a93fc2eb40a99c63fa
+readmeFile: README.md
+changelogFile: CHANGELOG.md
 ```
 
 When a package is modified in any way (e.g. updates of the context information in the `.janno` file), then the `packageVersion` field should be incremented and the `lastModified` field updated to the current date.
@@ -125,7 +129,7 @@ Example:
 }
 ```
 
-## The `README.txt` file
+## The `README.md` file
 
 Informal information accompanying the package.
 
@@ -135,19 +139,22 @@ Example:
 This package contains a rather interesting set of samples relevant for the peopling of the Territory of Christmas Island in the Indian Ocean. We consider this especially relevant, because ...
 ```
 
-## The `CHANGELOG.txt` file
+## The `CHANGELOG.md` file
 
 Documentation of important changes in the history of a package.
 
 Example:
 
 ```
-V 1.2.0: Fixed a spelling mistake in the site name "Hosenacker"->"Rosenacker"
-V 1.1.1: Added mtDNA contamination estimation to .janno file
-V 1.1.0: The authors of @Gassenhauer_2021 made some previously restricted samples for their publication available later and we added them
-V 1.0.0: Creation of the package
+- V 1.2.0: Fixed a spelling mistake in the site name "Hosenacker"->"Rosenacker"
+- V 1.1.1: Added mtDNA contamination estimation to .janno file
+- V 1.1.0: The authors of @Gassenhauer_2021 made some previously restricted samples for their publication available later and we added them
+- V 1.0.0: Creation of the package
 ```
 
-## The Sequencing Source file
+## The `.ssf` file
+
+Poseidon 2.7.0 added an option to specify sequencing source data. This is a tab-separated table, much like the `.janno` file, but following a different schema, specified in the file `ssf_columns.tsv`.
+
+Note that the primary entities in this table are sequencing entities (typically corresponding to DNA libraries or even multiple runs/lanes of the same library). The link to the individuals listed in the `.janno` file is made through a foreign-key relationship into `Poseidon_ID`.
 
-Poseidon 2.7.0 added an option to specify sequencing source data. This is a tab-separated table, much like the Janno file, but following a different schema, specified in the file `sequencingSourceFile_columns.tsv`. Note that the primary entities in this table are Sequencing entities (typically corresponding to DNA libraries or even multiple runs/lanes of the same library). The link to the Individuals listed in the Janno-file are made through a foreign-key relationship into `Poseidon_ID`.
diff --git a/janno_columns.tsv b/janno_columns.tsv
@@ -9,6 +9,7 @@ Relation_Type	relationship type for relatives mentioned in Related_To as an arbi
 Relation_Note	arbitrary comments about the relations of this individual	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Collection_ID	id as defined by the provider/owner of a sample (e.g. grave 40 skeleton 2)	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Country	present-day political country	String	FALSE	FALSE	FALSE				FALSE	FALSE
+Country_ISO	present-day political country expressed as an ISO 3166-1 alpha-2 country code	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Location	unspecified location information like administrative or topographic region or mountains/rivers/lakes/cities nearby	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Site	site name	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Latitude	latitude with up to 5 places after the decimal point	Float	FALSE	FALSE	TRUE		-90	90	FALSE	FALSE
@@ -23,11 +24,12 @@ Date_BC_AD_Stop	upper (more recent) bound for the age, negative numbers for BC,
 Date_Note	a free text field for arbitrary comments about the dating information	String	FALSE	FALSE	FALSE				FALSE	FALSE
 MT_Haplogroup	mitochondrial haplogroup after phylotree.org as reported by Haplofind or Haplogrep	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Y_Haplogroup	Y-chromosome haplogroup reported as published, for internal data, please follow syntax with main branch + most terminal derived Y-SNP (e.g. R1b-P312)	String	FALSE	FALSE	FALSE				FALSE	FALSE
-Source_Tissue	skeletal/tissue/source elements, specific bone name should be reported with an underscore (e.g. bone_phalanx), multiple values separated by ; in case of multiple libraries	String	TRUE	FALSE	FALSE				FALSE	FALSE
+Source_Tissue	skeletal/tissue/source elements, specific bone name should be reported with an underscore (e.g. bone_phalanx), multiple values separated by ;	String	TRUE	FALSE	FALSE				FALSE	FALSE
 Nr_Libraries	number of libraries	Integer	FALSE	FALSE	FALSE				FALSE	FALSE
+Library_Names	identifiers of the libraries used to generate the genotype data, multiple values separated by ;	String	TRUE	FALSE	FALSE				FALSE	FALSE
 Capture_Type	specifics of data generation method, multiple values separated by ;	String	TRUE	TRUE	FALSE	Shotgun;1240K;ArborComplete;ArborPrimePlus;ArborAncestralPlus;TwistAncientDNA;OtherCapture;ReferenceGenome			FALSE	FALSE
-UDG 	“mixed” in case multiple libraries with different UDG treatment were merged	String	FALSE	TRUE	FALSE	minus;half;plus;mixed			FALSE	FALSE
-Library_Built	“ds” for double stranded, “ss” for single stranded, “mixed” in case multiple libraries with different protocols were merged	String	FALSE	TRUE	FALSE	ds;ss;other			FALSE	FALSE
+UDG	UDG treatment, “mixed” in case multiple libraries with different UDG treatment were merged	String	FALSE	TRUE	FALSE	minus;half;plus;mixed			FALSE	FALSE
+Library_Built	strandedness, “mixed” in case multiple libraries with different protocols were merged	String	FALSE	TRUE	FALSE	ds;ss;mixed			FALSE	FALSE
 Genotype_Ploidy	ploidy of the genotypes	String	FALSE	TRUE	FALSE	diploid;haploid			FALSE	FALSE
 Data_Preparation_Pipeline_URL	URL pointing to a description of the pipeline used to generate the genotype data from the source data	String	FALSE	FALSE	FALSE				FALSE	FALSE
 Endogenous	% endogenous DNA as estimated from SG libraries (before capture), as for example estimated by EAGER, not on target and no quality filter, in case of multiple libraries report only the highest value	Float	FALSE	FALSE	TRUE		0	100	FALSE	FALSE
diff --git a/ssf_columns.tsv b/ssf_columns.tsv
@@ -1,6 +1,7 @@
 sequencingSourceFile_column_name	description	data_type	multi	choice	range	choice_options	range_lower	range_upper	mandatory	unique
-
-Poseidon_ID	The Poseidon_ID field that this sequencing entity corresponds to, from the Janno-file.	String	FALSE	FALSE	FALSE				TRUE	FALSE
+poseidon_IDs	The Poseidon_IDs this sequencing entity corresponds to, from the Janno-file, multiple entries separated by ;	String	TRUE	FALSE	FALSE				TRUE	FALSE
+udg	UDG treatment	String	FALSE	TRUE	FALSE	minus;half;plus			FALSE	FALSE
+library_built	strandedness	String	FALSE	TRUE	FALSE	ds;ss			FALSE	FALSE
 sample_accession	The sample accession code as used in INSDC databases, including ENA and SRA (Example: SAMEA7050454)	String	FALSE	FALSE	FALSE				TRUE	TRUE
 study_accession	The study accession code as used in INSDC databases, including ENA and SRA (Example: PRJEB39316)	String	FALSE	FALSE	FALSE				FALSE	FALSE
 run_accession	The run accession code as used in INSDC databases, including ENA and SRA (Example: ERR4331996)	String	FALSE	FALSE	FALSE				FALSE	FALSE
@@ -19,4 +20,4 @@ fastq_aspera	The Aspera-link (URL) to the FASTQ-file(s). (Example: fasp.sra.ebi.
 fastq_bytes	The number of bytes of the FASTQ-file(s) in bytes	Integer	TRUE	FALSE	TRUE		0	Inf	FALSE	FALSE
 fastq_md5	The MD5 hash(es) of the FASTQ-file(s)	String	TRUE	FALSE	FALSE				FALSE	FALSE
 read_count	The number of reads	Integer	FALSE	FALSE	TRUE		0	Inf	FALSE	FALSE
-submitted_ftp	The URL(s) to the originally submitted file(s) before it got converted to FASTQ. This can sometimes be helpful for processing	String	TRUE	FALSE	FALSE				FALSE	FALSE
+submitted_ftp	The URL(s) to the originally submitted file(s) before it got converted to FASTQ. This can sometimes be helpful for processing	String	TRUE	FALSE	FALSE				FALSE	FALSE