129 template <
typename stream_type,
130 typename seq_legal_alph_type,
131 typename stream_pos_type,
137 stream_pos_type & position_buffer,
140 qual_type & qualities);
142 template <
typename stream_type,
150 qual_type && qualities);
152 template <
typename stream_type,
153 typename seq_legal_alph_type,
154 typename ref_seqs_type,
155 typename ref_ids_type,
156 typename stream_pos_type,
159 typename ref_seq_type,
160 typename ref_id_type,
161 typename ref_offset_type,
167 typename tag_dict_type,
168 typename e_value_type,
169 typename bit_score_type>
172 ref_seqs_type & ref_seqs,
174 stream_pos_type & position_buffer,
178 ref_seq_type & SEQAN3_DOXYGEN_ONLY(
ref_seq),
181 cigar_type & cigar_vector,
185 tag_dict_type & tag_dict,
186 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
187 bit_score_type & SEQAN3_DOXYGEN_ONLY(
bit_score));
189 template <
typename stream_type,
190 typename header_type,
193 typename ref_seq_type,
194 typename ref_id_type,
197 typename tag_dict_type,
198 typename e_value_type,
199 typename bit_score_type>
202 header_type && header,
206 ref_seq_type && SEQAN3_DOXYGEN_ONLY(
ref_seq),
213 tag_dict_type && tag_dict,
214 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
215 bit_score_type && SEQAN3_DOXYGEN_ONLY(
bit_score));
225 sam_file_header<> default_header{};
237 template <
typename t>
238 decltype(
auto) default_or(t && v)
const noexcept
240 return std::forward<t>(v);
243 template <arithmetic value_type>
248 void read_sam_dict(
std::string_view const tag_str, sam_tag_dictionary & target);
250 template <
typename stream_it_t, std::ranges::forward_range field_type>
251 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
253 template <
typename stream_it_t>
254 void write_range_or_asterisk(stream_it_t & stream_it,
char const *
const field_value);
256 template <
typename stream_it_t>
257 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary
const & tag_dict,
char const separator);
261template <
typename stream_type,
262 typename seq_legal_alph_type,
263 typename stream_pos_type,
269 stream_pos_type & position_buffer,
272 qual_type & qualities)
297 if constexpr (!detail::decays_to_ignore_v<seq_type>)
298 if (std::ranges::distance(
sequence) == 0)
299 throw parse_error{
"The sequence information must not be empty."};
300 if constexpr (!detail::decays_to_ignore_v<id_type>)
301 if (std::ranges::distance(
id) == 0)
302 throw parse_error{
"The id information must not be empty."};
309template <
typename stream_type,
317 qual_type && qualities)
327 default_or(qualities),
342template <
typename stream_type,
343 typename seq_legal_alph_type,
344 typename ref_seqs_type,
345 typename ref_ids_type,
346 typename stream_pos_type,
349 typename ref_seq_type,
350 typename ref_id_type,
351 typename ref_offset_type,
357 typename tag_dict_type,
358 typename e_value_type,
359 typename bit_score_type>
363 ref_seqs_type & ref_seqs,
365 stream_pos_type & position_buffer,
369 ref_seq_type & SEQAN3_DOXYGEN_ONLY(
ref_seq),
372 cigar_type & cigar_vector,
376 tag_dict_type & tag_dict,
377 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
378 bit_score_type & SEQAN3_DOXYGEN_ONLY(
bit_score))
380 static_assert(detail::decays_to_ignore_v<ref_offset_type>
381 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
382 "The ref_offset must be a specialisation of std::optional.");
384 auto stream_it = detail::fast_istreambuf_iterator{*stream.rdbuf()};
386 auto stream_view = detail::istreambuf(stream);
388 int32_t ref_offset_tmp{};
389 std::ranges::range_value_t<
decltype(header.
ref_ids())> ref_id_tmp{};
395 read_header(stream_view, header, ref_seqs);
397 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
402 position_buffer = stream.tellg();
407 stream_it.cache_record_into(
'\n',
'\t', raw_record);
411 if constexpr (!detail::decays_to_ignore_v<id_type>)
412 read_forward_range_field(raw_record[0],
id);
414 uint16_t flag_integral{};
415 read_arithmetic_field(raw_record[1], flag_integral);
418 read_forward_range_field(raw_record[2], ref_id_tmp);
419 check_and_assign_ref_id(
ref_id, ref_id_tmp, header, ref_seqs);
421 read_arithmetic_field(raw_record[3], ref_offset_tmp);
424 if (ref_offset_tmp == -1)
426 else if (ref_offset_tmp > -1)
428 else if (ref_offset_tmp < -1)
429 throw format_error{
"No negative values are allowed for field::ref_offset."};
431 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
432 read_arithmetic_field(raw_record[4],
mapq);
436 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
437 cigar_vector = detail::parse_cigar(raw_record[5]);
441 if constexpr (!detail::decays_to_ignore_v<mate_type>)
443 std::ranges::range_value_t<
decltype(header.
ref_ids())> tmp_mate_ref_id{};
444 read_forward_range_field(raw_record[6], tmp_mate_ref_id);
446 if (tmp_mate_ref_id ==
"=")
448 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
451 check_and_assign_ref_id(get<0>(
mate), ref_id_tmp, header, ref_seqs);
455 check_and_assign_ref_id(get<0>(
mate), tmp_mate_ref_id, header, ref_seqs);
459 read_arithmetic_field(raw_record[7], tmp_pnext);
462 get<1>(
mate) = --tmp_pnext;
463 else if (tmp_pnext < 0)
464 throw format_error{
"No negative values are allowed at the mate mapping position."};
467 read_arithmetic_field(raw_record[8], get<2>(
mate));
472 if constexpr (!detail::decays_to_ignore_v<seq_type>)
476 if (!seq_str.starts_with(
'*'))
479 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
481 for (
size_t i = 0; i < seq_str.
size(); ++i)
483 if (!is_legal_alph(seq_str[i]))
485 + detail::type_name_as_string<seq_legal_alph_type>
486 +
"> evaluated to false on " + detail::make_printable(seq_str[i])};
498 size_t tag_begin_pos = raw_record[10].find(
'\t');
501 (tag_begin_pos == std::string_view::npos) ? raw_record[10] : raw_record[10].substr(0, tag_begin_pos);
503 if constexpr (!detail::decays_to_ignore_v<qual_type>)
504 read_forward_range_field(qualities,
qual);
506 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
508 if (std::ranges::distance(
seq) != 0 && std::ranges::distance(
qual) != 0
509 && std::ranges::distance(
seq) != std::ranges::distance(
qual))
511 throw format_error{detail::to_string(
"Sequence length (",
512 std::ranges::distance(
seq),
513 ") and quality length (",
514 std::ranges::distance(
qual),
515 ") must be the same.")};
521 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
523 while (tag_begin_pos != std::string_view::npos)
526 size_t const tag_end_pos = raw_record[10].find(
'\t', tag_begin_pos);
528 char const * tag_begin = raw_record[10].
begin() + tag_begin_pos;
529 char const * tag_end =
530 (tag_end_pos == std::string_view::npos) ? raw_record[10].end() : raw_record[10].
begin() + tag_end_pos;
534 tag_begin_pos = tag_end_pos;
538 assert(stream_it == std::default_sentinel_t{} || *stream_it ==
'\n');
543template <
typename stream_type,
544 typename header_type,
547 typename ref_seq_type,
548 typename ref_id_type,
551 typename tag_dict_type,
552 typename e_value_type,
553 typename bit_score_type>
556 header_type && header,
560 ref_seq_type && SEQAN3_DOXYGEN_ONLY(
ref_seq),
567 tag_dict_type && tag_dict,
568 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
569 bit_score_type && SEQAN3_DOXYGEN_ONLY(
bit_score))
588 "The seq object must be a std::ranges::forward_range over "
589 "letters that model seqan3::alphabet.");
592 "The id object must be a std::ranges::forward_range over "
593 "letters that model seqan3::alphabet.");
595 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
597 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
598 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
599 "The ref_id object must be a std::ranges::forward_range "
600 "over letters that model seqan3::alphabet.");
602 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
603 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>)
604 static_assert(!detail::decays_to_ignore_v<header_type>,
605 "If you give indices as reference id information the header must also be present.");
609 "The qual object must be a std::ranges::forward_range "
610 "over letters that model seqan3::alphabet.");
613 "The mate object must be a std::tuple of size 3 with "
614 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
615 "2) a std::integral or std::optional<std::integral>, and "
616 "3) a std::integral.");
619 ((std::ranges::forward_range<decltype(std::get<0>(
mate))>
620 || std::integral<std::remove_cvref_t<
decltype(std::get<0>(
mate))>>
621 || detail::is_type_specialisation_of_v<
622 std::remove_cvref_t<
decltype(std::get<0>(
mate))>,
623 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>>
624 || detail::is_type_specialisation_of_v<
625 std::remove_cvref_t<
decltype(std::get<1>(
mate))>,
626 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
627 "The mate object must be a std::tuple of size 3 with "
628 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
629 "2) a std::integral or std::optional<std::integral>, and "
630 "3) a std::integral.");
632 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(
mate))>>
633 || detail::is_type_specialisation_of_v<std::remove_cvref_t<
decltype(std::get<0>(
mate))>,
635 static_assert(!detail::decays_to_ignore_v<header_type>,
636 "If you give indices as mate reference id information the header must also be present.");
639 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
644 if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
645 && !std::integral<std::remove_reference_t<ref_id_type>>
646 && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>,
std::optional>)
653 if constexpr (std::ranges::contiguous_range<
decltype(
ref_id)> && std::ranges::sized_range<
decltype(
ref_id)>
654 && std::ranges::borrowed_range<
decltype(
ref_id)>)
663 "The ref_id type is not convertible to the reference id information stored in the "
664 "reference dictionary of the header object.");
672 "' was not in the list of references:",
678 throw format_error{
"The ref_offset object must be a std::integral >= 0."};
683 if constexpr (!detail::decays_to_ignore_v<header_type>)
687 write_header(stream, options, header);
688 header_was_written =
true;
696 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
697 constexpr char separator{
'\t'};
699 write_range_or_asterisk(stream_it,
id);
700 *stream_it = separator;
702 stream_it.write_number(
static_cast<uint16_t
>(
flag));
703 *stream_it = separator;
705 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
707 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
709 write_range_or_asterisk(stream_it, (header.
ref_ids())[
ref_id]);
711 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>,
std::optional>)
714 write_range_or_asterisk(stream_it, (header.
ref_ids())[
ref_id.value()]);
720 write_range_or_asterisk(stream_it,
ref_id);
728 *stream_it = separator;
731 stream_it.write_number(
ref_offset.value_or(-1) + 1);
732 *stream_it = separator;
734 stream_it.write_number(
static_cast<unsigned>(
mapq));
735 *stream_it = separator;
737 if (!std::ranges::empty(cigar_vector))
739 for (
auto & c : cigar_vector)
740 stream_it.write_range(c.to_string());
747 *stream_it = separator;
749 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(
mate))>>)
751 write_range_or_asterisk(stream_it, (header.
ref_ids())[get<0>(
mate)]);
753 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(
mate))>,
756 if (get<0>(
mate).has_value())
757 write_range_or_asterisk(stream_it, header.
ref_ids()[get<0>(
mate).value()]);
763 write_range_or_asterisk(stream_it, get<0>(
mate));
766 *stream_it = separator;
768 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(
mate))>,
std::optional>)
771 stream_it.write_number(get<1>(
mate).value_or(-1) + 1);
772 *stream_it = separator;
776 stream_it.write_number(get<1>(
mate));
777 *stream_it = separator;
780 stream_it.write_number(get<2>(
mate));
781 *stream_it = separator;
783 write_range_or_asterisk(stream_it,
seq);
784 *stream_it = separator;
786 write_range_or_asterisk(stream_it,
qual);
788 write_tag_fields(stream_it, tag_dict, separator);
809template <arithmetic value_type>
818 while (start_pos != std::string_view::npos)
820 end_pos = str.
find(
',', start_pos);
821 auto end = (end_pos == std::string_view::npos) ? str.
end() : str.
begin() + end_pos;
823 tmp_vector.push_back(value);
825 start_pos = (end_pos == std::string_view::npos) ? end_pos : end_pos + 1;
827 variant = std::move(tmp_vector);
845 uint8_t dummy_byte{};
847 if (str.
size() % 2 != 0)
848 throw format_error{
"[CORRUPTED SAM FILE] Hexadecimal tag must have even number of digits."};
852 for (
auto hex_begin = str.
begin(), hex_end = str.
begin() + 2; hex_begin != str.
end(); hex_begin += 2, hex_end += 2)
854 auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);
856 if (res.ec == std::errc::invalid_argument)
858 +
"' could not be cast into type uint8_t."};
860 if (res.ec == std::errc::result_out_of_range)
862 +
"' into type uint8_t would cause an overflow."};
864 tmp_vector.
push_back(std::byte{dummy_byte});
867 variant = std::move(tmp_vector);
885inline void format_sam::read_sam_dict(
std::string_view const tag_str, sam_tag_dictionary & target)
892 assert(tag_str.
size() > 5);
894 uint16_t tag =
static_cast<uint16_t
>(tag_str[0]) << 8;
895 tag +=
static_cast<uint16_t
>(tag_str[1]);
897 char type_id = tag_str[3];
903 assert(tag_str.
size() == 6);
904 target[tag] = tag_str[5];
910 read_arithmetic_field(tag_str.
substr(5), tmp);
917 read_arithmetic_field(tag_str.
substr(5), tmp);
928 read_sam_byte_vector(target[tag], tag_str.
substr(5));
933 assert(tag_str.
size() > 6);
934 char array_value_type_id = tag_str[5];
936 switch (array_value_type_id)
939 read_sam_dict_vector(target[tag], tag_str.
substr(7), int8_t{});
942 read_sam_dict_vector(target[tag], tag_str.
substr(7), uint8_t{});
945 read_sam_dict_vector(target[tag], tag_str.
substr(7), int16_t{});
948 read_sam_dict_vector(target[tag], tag_str.
substr(7), uint16_t{});
951 read_sam_dict_vector(target[tag], tag_str.
substr(7), int32_t{});
954 read_sam_dict_vector(target[tag], tag_str.
substr(7), uint32_t{});
957 read_sam_dict_vector(target[tag], tag_str.
substr(7),
float{});
960 throw format_error{
std::string(
"The first character in the numerical ")
961 +
"id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
967 throw format_error{
std::string(
"The second character in the numerical id of a "
968 "SAM tag ([TAG]:[TYPE_ID]:[VALUE]) must be one of [A,i,Z,H,B,f] but '")
969 + type_id +
"' was given."};
980template <
typename stream_it_t, std::ranges::forward_range field_type>
981inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
983 if (std::ranges::empty(field_value))
989 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>,
char>)
990 stream_it.write_range(field_value);
1002template <
typename stream_it_t>
1003inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it,
char const *
const field_value)
1015template <
typename stream_it_t>
1017format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary
const & tag_dict,
char const separator)
1019 auto const stream_variant_fn = [&stream_it](
auto && arg)
1021 using T = std::remove_cvref_t<
decltype(arg)>;
1023 if constexpr (std::ranges::input_range<T>)
1025 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>,
char>)
1027 stream_it.write_range(arg);
1029 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1031 if (!std::ranges::empty(arg))
1033 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1035 for (
auto && elem : arg |
std::views::
drop(1))
1038 stream_it.write_number(std::to_integer<uint8_t>(elem));
1044 if (!std::ranges::empty(arg))
1046 stream_it.write_number(*std::ranges::begin(arg));
1048 for (
auto && elem : arg |
std::views::
drop(1))
1051 stream_it.write_number(elem);
1056 else if constexpr (std::same_as<std::remove_cvref_t<T>,
char>)
1062 stream_it.write_number(arg);
1066 for (
auto & [tag, variant] : tag_dict)
1068 *stream_it = separator;
1070 char const char0 = tag / 256;
1071 char const char1 = tag % 256;
1076 *stream_it = detail::sam_tag_type_char[variant.
index()];
1079 if (detail::sam_tag_type_char_extra[variant.
index()] !=
'\0')
1081 *stream_it = detail::sam_tag_type_char_extra[variant.
index()];
Core alphabet concept and free function/type trait wrappers.
The SAM tag dictionary class that stores all optional SAM fields.
Definition sam_tag_dictionary.hpp:330
Provides seqan3::detail::fast_ostreambuf_iterator.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition to_char.hpp:63
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:524
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ bit_score
The bit score (statistical significance indicator), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:63
constexpr auto is_space
Checks whether c is a space character.
Definition predicate.hpp:125
seqan::stl::ranges::to to
Converts a range to a container. <dl class="no-api">This entity is not part of the SeqAn API....
Definition to.hpp:26
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition type_list/traits.hpp:395
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition io/exception.hpp:48
The options type defines various option members that influence the behavior of all or some formats.
Definition sam_file/output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition sam_file/output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition sam_file/output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition sequence_file/output_options.hpp:26
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::ranges::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.