From 427d23cb3378d25a2ecd240c6870423fbaf01066 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 5 May 2026 11:59:43 +0100 Subject: [PATCH] Clarify how defaults work when building an index with tabix As the tabix index builder does file format detection, it's a bit misleading to say that the default for, say, -b is always 4. Instead, add a paragraph to the help text and the man page explaining in more detail how the configuration options work. --- tabix.1 | 31 +++++++++++++++++++++++++------ tabix.c | 15 ++++++++++----- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/tabix.1 b/tabix.1 index 449c60cfe..4b59df2f5 100644 --- a/tabix.1 +++ b/tabix.1 @@ -87,17 +87,17 @@ Specify that the position in the data file is 0-based half-open (e.g. UCSC files) rather than 1-based. .TP .BI "-b, --begin " INT -Column of start chromosomal position. [4] +Column of start chromosomal position. .TP .BI "-c, --comment " CHAR -Skip lines started with character CHAR. [#] +Skip lines started with character CHAR. .TP .BI "-C, --csi" Produce CSI format index instead of classical tabix or BAI style indices. .TP .BI "-e, --end " INT Column of end chromosomal position. The end column can be the same as the -start column. [5] +start column. .TP .B "-f, --force " Force to overwrite the index file if it is present. @@ -110,15 +110,15 @@ Input format for indexing. Valid values are: gff, bed, sam, vcf. This option should not be applied together with any of .BR -s ", " -b ", " -e ", " -c " and " -0 ; it is not used for data retrieval because this setting is stored in -the index file. [gff] +the index file. .TP .BI "-s, --sequence " INT Column of sequence name. Option .BR -s ", " -b ", " -e ", " -S ", " -c " and " -0 -are all stored in the index file and thus not used in data retrieval. [1] +are all stored in the index file and thus not used in data retrieval. .TP .BI "-S, --skip-lines " INT -Skip first INT lines in the data file. [0] +Skip first INT lines in the data file. 
.SH QUERYING AND OTHER OPTIONS .TP @@ -173,6 +173,25 @@ Values higher than 3 produce additional informational and debugging messages. .BI "-@, --threads " INT Set number of threads to use for the operation. The default is 0, where no extra threads are in use. +.PP +Unless one of the options +.BR -0 , +.BR -b , +.BR -c , +.BR -e , +.BR -p , +.BR -s , +or +.B -S +is used when building an index, +tabix will attempt to set its configuration by detecting the file type being +indexed. +If detection fails, or any of the options above is present, +the settings used will be those for the gff preset unless overridden, +.br +i.e. one-based positions, +.B -s 1 -b 4 -e 5 -c '#' -S 0 + .PP .SH EXAMPLE (grep "^#" in.gff; grep -v "^#" in.gff | sort -t"`printf '\(rst'`" -k1,1 -k4,4n) | bgzip > sorted.gff.gz; diff --git a/tabix.c b/tabix.c index bd7d100f6..79d215b69 100644 --- a/tabix.c +++ b/tabix.c @@ -585,15 +585,15 @@ static int usage(FILE *fp, int status) fprintf(fp, "\n"); fprintf(fp, "Indexing Options:\n"); fprintf(fp, " -0, --zero-based coordinates are zero-based\n"); - fprintf(fp, " -b, --begin INT column number for region start [4]\n"); - fprintf(fp, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n"); + fprintf(fp, " -b, --begin INT column number for region start\n"); + fprintf(fp, " -c, --comment CHAR skip comment lines starting with CHAR\n"); fprintf(fp, " -C, --csi generate CSI index for VCF (default is TBI)\n"); - fprintf(fp, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n"); + fprintf(fp, " -e, --end INT column number for region end (if no end, set INT to -b)\n"); fprintf(fp, " -f, --force overwrite existing index without asking\n"); fprintf(fp, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); fprintf(fp, " -p, --preset STR gff, bed, sam, vcf, gaf\n"); - fprintf(fp, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n"); - fprintf(fp, " -S, --skip-lines INT skip 
first INT lines [0]\n"); + fprintf(fp, " -s, --sequence INT column number for sequence names (suppressed by -p)\n"); + fprintf(fp, " -S, --skip-lines INT skip first INT lines\n"); fprintf(fp, "\n"); fprintf(fp, "Querying and other options:\n"); fprintf(fp, " -h, --print-header print also the header lines\n"); @@ -608,6 +608,11 @@ static int usage(FILE *fp, int status) fprintf(fp, " --verbosity INT set verbosity [3]\n"); fprintf(fp, " -@, --threads INT number of additional threads to use [0]\n"); fprintf(fp, "\n"); + fprintf(fp, "Unless one of the options -0, -b, -c, -e, -p, -s, or -S is used when building\n"); + fprintf(fp, "an index, tabix will attempt to set its configuration by detecting the file\n"); + fprintf(fp, "type being indexed. If detection fails, or any of the options above is present,\n"); + fprintf(fp, "the settings used will be those for the gff preset unless overridden\n"); + fprintf(fp, "i.e. one-based positions, -s 1 -b 4 -e 5 -c '#' -S 0\n"); return status; }