SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_sam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <iterator>
16#include <ranges>
17#include <string>
18#include <vector>
19
38
39namespace seqan3
40{
41
107class format_sam : protected detail::format_sam_base
108{
109public:
113 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
114 format_sam() = default;
115 format_sam(format_sam const &) = delete;
116 format_sam & operator=(format_sam const &) = delete;
117 format_sam(format_sam &&) = default;
119 ~format_sam() = default;
120
122
125 {"sam"},
126 };
127
128protected:
129 template <typename stream_type, // constraints checked by file
130 typename seq_legal_alph_type,
131 typename stream_pos_type,
132 typename seq_type, // other constraints checked inside function
133 typename id_type,
134 typename qual_type>
135 void read_sequence_record(stream_type & stream,
137 stream_pos_type & position_buffer,
138 seq_type & sequence,
139 id_type & id,
140 qual_type & qualities);
141
142 template <typename stream_type, // constraints checked by file
143 typename seq_type, // other constraints checked inside function
144 typename id_type,
145 typename qual_type>
146 void write_sequence_record(stream_type & stream,
147 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
148 seq_type && sequence,
149 id_type && id,
150 qual_type && qualities);
151
152 template <typename stream_type, // constraints checked by file
153 typename seq_legal_alph_type,
154 typename ref_seqs_type,
155 typename ref_ids_type,
156 typename stream_pos_type,
157 typename seq_type,
158 typename id_type,
159 typename ref_seq_type,
160 typename ref_id_type,
161 typename ref_offset_type,
162 typename cigar_type,
163 typename flag_type,
164 typename mapq_type,
165 typename qual_type,
166 typename mate_type,
167 typename tag_dict_type,
168 typename e_value_type,
169 typename bit_score_type>
170 void read_alignment_record(stream_type & stream,
171 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
172 ref_seqs_type & ref_seqs,
174 stream_pos_type & position_buffer,
175 seq_type & seq,
176 qual_type & qual,
177 id_type & id,
178 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
179 ref_id_type & ref_id,
180 ref_offset_type & ref_offset,
181 cigar_type & cigar_vector,
182 flag_type & flag,
183 mapq_type & mapq,
184 mate_type & mate,
185 tag_dict_type & tag_dict,
186 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
187 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
188
189 template <typename stream_type,
190 typename header_type,
191 typename seq_type,
192 typename id_type,
193 typename ref_seq_type,
194 typename ref_id_type,
195 typename qual_type,
196 typename mate_type,
197 typename tag_dict_type,
198 typename e_value_type,
199 typename bit_score_type>
200 void write_alignment_record(stream_type & stream,
201 sam_file_output_options const & options,
202 header_type && header,
203 seq_type && seq,
204 qual_type && qual,
205 id_type && id,
206 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
207 ref_id_type && ref_id,
209 std::vector<cigar> const & cigar_vector,
210 sam_flag const flag,
211 uint8_t const mapq,
212 mate_type && mate,
213 tag_dict_type && tag_dict,
214 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
215 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
216
217private:
219 std::string tmp_qual{};
220
222 static constexpr std::string_view dummy{};
223
225 sam_file_header<> default_header{};
226
229
231 std::string_view const & default_or(detail::ignore_t) const noexcept
232 {
233 return dummy;
234 }
235
237 template <typename t>
238 decltype(auto) default_or(t && v) const noexcept
239 {
240 return std::forward<t>(v);
241 }
242
243 template <arithmetic value_type>
244 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, std::string_view const str, value_type value);
245
246 void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, std::string_view const str);
247
248 void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary & target);
249
250 template <typename stream_it_t, std::ranges::forward_range field_type>
251 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
252
253 template <typename stream_it_t>
254 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
255
256 template <typename stream_it_t>
257 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
258};
259
261template <typename stream_type, // constraints checked by file
262 typename seq_legal_alph_type,
263 typename stream_pos_type,
264 typename seq_type, // other constraints checked inside function
265 typename id_type,
266 typename qual_type>
267inline void format_sam::read_sequence_record(stream_type & stream,
269 stream_pos_type & position_buffer,
270 seq_type & sequence,
271 id_type & id,
272 qual_type & qualities)
273{
275
276 {
278 align_options,
279 std::ignore,
280 default_header,
281 position_buffer,
282 sequence,
283 qualities,
284 id,
285 std::ignore,
286 std::ignore,
287 std::ignore,
288 std::ignore,
289 std::ignore,
290 std::ignore,
291 std::ignore,
292 std::ignore,
293 std::ignore,
294 std::ignore);
295 }
296
297 if constexpr (!detail::decays_to_ignore_v<seq_type>)
298 if (std::ranges::distance(sequence) == 0)
299 throw parse_error{"The sequence information must not be empty."};
300 if constexpr (!detail::decays_to_ignore_v<id_type>)
301 if (std::ranges::distance(id) == 0)
302 throw parse_error{"The id information must not be empty."};
303
304 if (options.truncate_ids)
305 id = id | detail::take_until_and_consume(is_space) | ranges::to<id_type>();
306}
307
309template <typename stream_type, // constraints checked by file
310 typename seq_type, // other constraints checked inside function
311 typename id_type,
312 typename qual_type>
313inline void format_sam::write_sequence_record(stream_type & stream,
314 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
315 seq_type && sequence,
316 id_type && id,
317 qual_type && qualities)
318{
319 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
320
321 sam_file_output_options output_options;
322
324 output_options,
325 /*header*/ std::ignore,
326 /*seq*/ default_or(sequence),
327 /*qual*/ default_or(qualities),
328 /*id*/ default_or(id),
329 /*ref_seq*/ std::string_view{},
330 /*ref_id*/ std::string_view{},
331 /*ref_offset*/ -1,
332 /*cigar_vector*/ std::vector<cigar>{},
333 /*flag*/ sam_flag::none,
334 /*mapq*/ 0,
335 /*mate*/ default_mate_t{},
336 /*tag_dict*/ sam_tag_dictionary{},
337 /*e_value*/ 0,
338 /*bit_score*/ 0);
339}
340
342template <typename stream_type, // constraints checked by file
343 typename seq_legal_alph_type,
344 typename ref_seqs_type,
345 typename ref_ids_type,
346 typename stream_pos_type,
347 typename seq_type,
348 typename id_type,
349 typename ref_seq_type,
350 typename ref_id_type,
351 typename ref_offset_type,
352 typename cigar_type,
353 typename flag_type,
354 typename mapq_type,
355 typename qual_type,
356 typename mate_type,
357 typename tag_dict_type,
358 typename e_value_type,
359 typename bit_score_type>
360inline void
362 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
363 ref_seqs_type & ref_seqs,
365 stream_pos_type & position_buffer,
366 seq_type & seq,
367 qual_type & qual,
368 id_type & id,
369 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
370 ref_id_type & ref_id,
371 ref_offset_type & ref_offset,
372 cigar_type & cigar_vector,
373 flag_type & flag,
374 mapq_type & mapq,
375 mate_type & mate,
376 tag_dict_type & tag_dict,
377 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
378 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
379{
380 static_assert(detail::decays_to_ignore_v<ref_offset_type>
381 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
382 "The ref_offset must be a specialisation of std::optional.");
383
384 auto stream_it = detail::fast_istreambuf_iterator{*stream.rdbuf()};
385
386 auto stream_view = detail::istreambuf(stream);
387
388 int32_t ref_offset_tmp{}; // needed to read the ref_offset (int) beofre storing it in std::optional<uint32_t>
389 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{}; // in case mate is requested but ref_offset not
390
391 // Header
392 // -------------------------------------------------------------------------------------------------------------
393 if (is_char<'@'>(*stream_it)) // we always read the header if present
394 {
395 read_header(stream_view, header, ref_seqs);
396
397 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
398 return;
399 }
400
401 // Store the current file position in the buffer.
402 position_buffer = stream.tellg();
403
404 // We don't know wether we have 11 or 12 fields, since the tags are optional.
405 // The last field will thus contain either the quality sequence
406 // or the quality sequence AND tags. This will be handled at the respective fields below.
407 stream_it.cache_record_into('\n', '\t', raw_record);
408
409 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
410 // -------------------------------------------------------------------------------------------------------------
411 if constexpr (!detail::decays_to_ignore_v<id_type>)
412 read_forward_range_field(raw_record[0], id);
413
414 uint16_t flag_integral{};
415 read_arithmetic_field(raw_record[1], flag_integral);
416 flag = sam_flag{flag_integral};
417
418 read_forward_range_field(raw_record[2], ref_id_tmp);
419 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
420
421 read_arithmetic_field(raw_record[3], ref_offset_tmp);
422 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
423
424 if (ref_offset_tmp == -1)
425 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
426 else if (ref_offset_tmp > -1)
427 ref_offset = ref_offset_tmp;
428 else if (ref_offset_tmp < -1)
429 throw format_error{"No negative values are allowed for field::ref_offset."};
430
431 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
432 read_arithmetic_field(raw_record[4], mapq);
433
434 // Field 6: CIGAR
435 // -------------------------------------------------------------------------------------------------------------
436 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
437 cigar_vector = detail::parse_cigar(raw_record[5]);
438
439 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
440 // -------------------------------------------------------------------------------------------------------------
441 if constexpr (!detail::decays_to_ignore_v<mate_type>)
442 {
443 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
444 read_forward_range_field(raw_record[6], tmp_mate_ref_id); // RNEXT
445
446 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
447 {
448 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
449 get<0>(mate) = ref_id;
450 else
451 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
452 }
453 else
454 {
455 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
456 }
457
458 int32_t tmp_pnext{};
459 read_arithmetic_field(raw_record[7], tmp_pnext); // PNEXT
460
461 if (tmp_pnext > 0)
462 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
463 else if (tmp_pnext < 0)
464 throw format_error{"No negative values are allowed at the mate mapping position."};
465 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
466
467 read_arithmetic_field(raw_record[8], get<2>(mate)); // TLEN
468 }
469
470 // Field 10: Sequence
471 // -------------------------------------------------------------------------------------------------------------
472 if constexpr (!detail::decays_to_ignore_v<seq_type>)
473 {
474 std::string_view const seq_str = raw_record[9];
475
476 if (!seq_str.starts_with('*')) // * indicates missing sequence information
477 {
478 seq.resize(seq_str.size());
479 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
480
481 for (size_t i = 0; i < seq_str.size(); ++i)
482 {
483 if (!is_legal_alph(seq_str[i]))
484 throw parse_error{std::string{"Encountered an unexpected letter: "} + "char_is_valid_for<"
485 + detail::type_name_as_string<seq_legal_alph_type>
486 + "> evaluated to false on " + detail::make_printable(seq_str[i])};
487
488 seq[i] = assign_char_to(seq_str[i], std::ranges::range_value_t<seq_type>{});
489 }
490 }
491 }
492
493 // Field 11: Quality
494 // -------------------------------------------------------------------------------------------------------------
495 // We don't know wether we have 11 or 12 fields, since the tags are optional.
496 // The last field will thus contain either the quality sequence
497 // or the quality sequence AND tags.
498 size_t tag_begin_pos = raw_record[10].find('\t');
499
500 std::string_view qualities =
501 (tag_begin_pos == std::string_view::npos) ? raw_record[10] : raw_record[10].substr(0, tag_begin_pos);
502
503 if constexpr (!detail::decays_to_ignore_v<qual_type>)
504 read_forward_range_field(qualities, qual);
505
506 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
507 {
508 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0
509 && std::ranges::distance(seq) != std::ranges::distance(qual))
510 {
511 throw format_error{detail::to_string("Sequence length (",
512 std::ranges::distance(seq),
513 ") and quality length (",
514 std::ranges::distance(qual),
515 ") must be the same.")};
516 }
517 }
518
519 // All remaining optional fields if any: SAM tags dictionary
520 // -------------------------------------------------------------------------------------------------------------
521 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
522 {
523 while (tag_begin_pos != std::string_view::npos) // read all tags if present
524 {
525 ++tag_begin_pos; // skip '\t'
526 size_t const tag_end_pos = raw_record[10].find('\t', tag_begin_pos);
527
528 char const * tag_begin = raw_record[10].begin() + tag_begin_pos;
529 char const * tag_end =
530 (tag_end_pos == std::string_view::npos) ? raw_record[10].end() : raw_record[10].begin() + tag_end_pos;
531
532 read_sam_dict(std::string_view{tag_begin, tag_end}, tag_dict);
533
534 tag_begin_pos = tag_end_pos;
535 }
536 }
537
538 assert(stream_it == std::default_sentinel_t{} || *stream_it == '\n');
539 ++stream_it; // Move from end of record to the beginning of the next or to the end of the stream.
540}
541
543template <typename stream_type,
544 typename header_type,
545 typename seq_type,
546 typename id_type,
547 typename ref_seq_type,
548 typename ref_id_type,
549 typename qual_type,
550 typename mate_type,
551 typename tag_dict_type,
552 typename e_value_type,
553 typename bit_score_type>
554inline void format_sam::write_alignment_record(stream_type & stream,
555 sam_file_output_options const & options,
556 header_type && header,
557 seq_type && seq,
558 qual_type && qual,
559 id_type && id,
560 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
561 ref_id_type && ref_id,
563 std::vector<cigar> const & cigar_vector,
564 sam_flag const flag,
565 uint8_t const mapq,
566 mate_type && mate,
567 tag_dict_type && tag_dict,
568 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
569 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
570{
571 /* Note the following general things:
572 *
573 * - Given the SAM specifications, all fields may be empty
574 *
575 * - arithmetic values default to 0 while all others default to '*'
576 *
577 * - Because of the former, arithmetic values can be directly streamed
578 * into 'stream' as operator<< is defined for all arithmetic types
579 * and the default value (0) is also the SAM default.
580 *
581 * - All other non-arithmetic values need to be checked for emptiness
582 */
583
584 // ---------------------------------------------------------------------
585 // Type Requirements (as static asserts for user friendliness)
586 // ---------------------------------------------------------------------
587 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
588 "The seq object must be a std::ranges::forward_range over "
589 "letters that model seqan3::alphabet.");
590
591 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
592 "The id object must be a std::ranges::forward_range over "
593 "letters that model seqan3::alphabet.");
594
595 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
596 {
597 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
598 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
599 "The ref_id object must be a std::ranges::forward_range "
600 "over letters that model seqan3::alphabet.");
601
602 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
603 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
604 static_assert(!detail::decays_to_ignore_v<header_type>,
605 "If you give indices as reference id information the header must also be present.");
606 }
607
608 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
609 "The qual object must be a std::ranges::forward_range "
610 "over letters that model seqan3::alphabet.");
611
613 "The mate object must be a std::tuple of size 3 with "
614 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
615 "2) a std::integral or std::optional<std::integral>, and "
616 "3) a std::integral.");
617
618 static_assert(
619 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
620 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
621 || detail::is_type_specialisation_of_v<
622 std::remove_cvref_t<decltype(std::get<0>(mate))>,
623 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
624 || detail::is_type_specialisation_of_v<
625 std::remove_cvref_t<decltype(std::get<1>(mate))>,
626 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
627 "The mate object must be a std::tuple of size 3 with "
628 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
629 "2) a std::integral or std::optional<std::integral>, and "
630 "3) a std::integral.");
631
632 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
633 || detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>,
635 static_assert(!detail::decays_to_ignore_v<header_type>,
636 "If you give indices as mate reference id information the header must also be present.");
637
638 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
639 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
640
641 // ---------------------------------------------------------------------
642 // logical Requirements
643 // ---------------------------------------------------------------------
644 if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
645 && !std::integral<std::remove_reference_t<ref_id_type>>
646 && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
647 {
648
649 if (options.sam_require_header && !std::ranges::empty(ref_id))
650 {
651 auto id_it = header.ref_dict.end();
652
653 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> && std::ranges::sized_range<decltype(ref_id)>
654 && std::ranges::borrowed_range<decltype(ref_id)>)
655 {
656 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
657 }
658 else
659 {
660 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
661
663 "The ref_id type is not convertible to the reference id information stored in the "
664 "reference dictionary of the header object.");
665
666 id_it = header.ref_dict.find(ref_id);
667 }
668
669 if (id_it == header.ref_dict.end()) // no reference id matched
670 throw format_error{detail::to_string("The ref_id '",
671 ref_id,
672 "' was not in the list of references:",
673 header.ref_ids())};
674 }
675 }
676
677 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
678 throw format_error{"The ref_offset object must be a std::integral >= 0."};
679
680 // ---------------------------------------------------------------------
681 // Writing the Header on first call
682 // ---------------------------------------------------------------------
683 if constexpr (!detail::decays_to_ignore_v<header_type>)
684 {
685 if (options.sam_require_header && !header_was_written)
686 {
687 write_header(stream, options, header);
688 header_was_written = true;
689 }
690 }
691
692 // ---------------------------------------------------------------------
693 // Writing the Record
694 // ---------------------------------------------------------------------
695
696 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
697 constexpr char separator{'\t'};
698
699 write_range_or_asterisk(stream_it, id);
700 *stream_it = separator;
701
702 stream_it.write_number(static_cast<uint16_t>(flag));
703 *stream_it = separator;
704
705 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
706 {
707 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
708 {
709 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
710 }
711 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
712 {
713 if (ref_id.has_value())
714 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
715 else
716 *stream_it = '*';
717 }
718 else
719 {
720 write_range_or_asterisk(stream_it, ref_id);
721 }
722 }
723 else
724 {
725 *stream_it = '*';
726 }
727
728 *stream_it = separator;
729
730 // SAM is 1 based, 0 indicates unmapped read if optional is not set
731 stream_it.write_number(ref_offset.value_or(-1) + 1);
732 *stream_it = separator;
733
734 stream_it.write_number(static_cast<unsigned>(mapq));
735 *stream_it = separator;
736
737 if (!std::ranges::empty(cigar_vector))
738 {
739 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
740 stream_it.write_range(c.to_string());
741 }
742 else
743 {
744 *stream_it = '*';
745 }
746
747 *stream_it = separator;
748
749 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
750 {
751 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
752 }
753 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>,
755 {
756 if (get<0>(mate).has_value())
757 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value()]);
758 else
759 *stream_it = '*';
760 }
761 else
762 {
763 write_range_or_asterisk(stream_it, get<0>(mate));
764 }
765
766 *stream_it = separator;
767
768 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
769 {
770 // SAM is 1 based, 0 indicates unmapped read if optional is not set
771 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
772 *stream_it = separator;
773 }
774 else
775 {
776 stream_it.write_number(get<1>(mate));
777 *stream_it = separator;
778 }
779
780 stream_it.write_number(get<2>(mate));
781 *stream_it = separator;
782
783 write_range_or_asterisk(stream_it, seq);
784 *stream_it = separator;
785
786 write_range_or_asterisk(stream_it, qual);
787
788 write_tag_fields(stream_it, tag_dict, separator);
789
790 stream_it.write_end_of_line(options.add_carriage_return);
791}
792
809template <arithmetic value_type>
810inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
811 std::string_view const str,
812 value_type value)
813{
814 std::vector<value_type> tmp_vector{};
815 size_t start_pos{0};
816 size_t end_pos{0};
817
818 while (start_pos != std::string_view::npos)
819 {
820 end_pos = str.find(',', start_pos);
821 auto end = (end_pos == std::string_view::npos) ? str.end() : str.begin() + end_pos;
822 read_arithmetic_field(std::string_view{str.begin() + start_pos, end}, value);
823 tmp_vector.push_back(value);
824
825 start_pos = (end_pos == std::string_view::npos) ? end_pos : end_pos + 1;
826 }
827 variant = std::move(tmp_vector);
828}
829
841inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, std::string_view const str)
842{
843 std::vector<std::byte> tmp_vector{};
844 // std::from_chars cannot directly parse into a std::byte
845 uint8_t dummy_byte{};
846
847 if (str.size() % 2 != 0)
848 throw format_error{"[CORRUPTED SAM FILE] Hexadecimal tag must have even number of digits."};
849
850 // H encodes bytes in a hexadecimal format. Two hex values are stored for each byte as characters.
851 // E.g., '1' and 'A' need one byte each and are read as `\x1A`, which is 27 in decimal.
852 for (auto hex_begin = str.begin(), hex_end = str.begin() + 2; hex_begin != str.end(); hex_begin += 2, hex_end += 2)
853 {
854 auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);
855
856 if (res.ec == std::errc::invalid_argument)
857 throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(hex_begin, hex_end)
858 + "' could not be cast into type uint8_t."};
859
860 if (res.ec == std::errc::result_out_of_range)
861 throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str)
862 + "' into type uint8_t would cause an overflow."};
863
864 tmp_vector.push_back(std::byte{dummy_byte});
865 }
866
867 variant = std::move(tmp_vector);
868}
869
885inline void format_sam::read_sam_dict(std::string_view const tag_str, sam_tag_dictionary & target)
886{
887 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
888 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
889 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
890 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
891 */
892 assert(tag_str.size() > 5);
893
894 uint16_t tag = static_cast<uint16_t>(tag_str[0]) << 8;
895 tag += static_cast<uint16_t>(tag_str[1]);
896
897 char type_id = tag_str[3];
898
899 switch (type_id)
900 {
901 case 'A': // char
902 {
903 assert(tag_str.size() == 6);
904 target[tag] = tag_str[5];
905 break;
906 }
907 case 'i': // int32_t
908 {
909 int32_t tmp;
910 read_arithmetic_field(tag_str.substr(5), tmp);
911 target[tag] = tmp;
912 break;
913 }
914 case 'f': // float
915 {
916 float tmp;
917 read_arithmetic_field(tag_str.substr(5), tmp);
918 target[tag] = tmp;
919 break;
920 }
921 case 'Z': // string
922 {
923 target[tag] = std::string{tag_str.substr(5)};
924 break;
925 }
926 case 'H':
927 {
928 read_sam_byte_vector(target[tag], tag_str.substr(5));
929 break;
930 }
931 case 'B': // Array. Value type depends on second char [cCsSiIf]
932 {
933 assert(tag_str.size() > 6);
934 char array_value_type_id = tag_str[5];
935
936 switch (array_value_type_id)
937 {
938 case 'c': // int8_t
939 read_sam_dict_vector(target[tag], tag_str.substr(7), int8_t{});
940 break;
941 case 'C': // uint8_t
942 read_sam_dict_vector(target[tag], tag_str.substr(7), uint8_t{});
943 break;
944 case 's': // int16_t
945 read_sam_dict_vector(target[tag], tag_str.substr(7), int16_t{});
946 break;
947 case 'S': // uint16_t
948 read_sam_dict_vector(target[tag], tag_str.substr(7), uint16_t{});
949 break;
950 case 'i': // int32_t
951 read_sam_dict_vector(target[tag], tag_str.substr(7), int32_t{});
952 break;
953 case 'I': // uint32_t
954 read_sam_dict_vector(target[tag], tag_str.substr(7), uint32_t{});
955 break;
956 case 'f': // float
957 read_sam_dict_vector(target[tag], tag_str.substr(7), float{});
958 break;
959 default:
960 throw format_error{std::string("The first character in the numerical ")
961 + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
962 + "' was given."};
963 }
964 break;
965 }
966 default:
967 throw format_error{std::string("The second character in the numerical id of a "
968 "SAM tag ([TAG]:[TYPE_ID]:[VALUE]) must be one of [A,i,Z,H,B,f] but '")
969 + type_id + "' was given."};
970 }
971}
972
980template <typename stream_it_t, std::ranges::forward_range field_type>
981inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
982{
983 if (std::ranges::empty(field_value))
984 {
985 *stream_it = '*';
986 }
987 else
988 {
989 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
990 stream_it.write_range(field_value);
991 else // convert from alphabets to their character representation
992 stream_it.write_range(field_value | views::to_char);
993 }
994}
995
1002template <typename stream_it_t>
1003inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1004{
1005 write_range_or_asterisk(stream_it, std::string_view{field_value});
1006}
1007
1015template <typename stream_it_t>
1016inline void
1017format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1018{
1019 auto const stream_variant_fn = [&stream_it](auto && arg) // helper to print a std::variant
1020 {
1021 using T = std::remove_cvref_t<decltype(arg)>;
1022
1023 if constexpr (std::ranges::input_range<T>)
1024 {
1025 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1026 {
1027 stream_it.write_range(arg);
1028 }
1029 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1030 {
1031 if (!std::ranges::empty(arg))
1032 {
1033 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1034
1035 for (auto && elem : arg | std::views::drop(1))
1036 {
1037 *stream_it = ',';
1038 stream_it.write_number(std::to_integer<uint8_t>(elem));
1039 }
1040 }
1041 }
1042 else
1043 {
1044 if (!std::ranges::empty(arg))
1045 {
1046 stream_it.write_number(*std::ranges::begin(arg));
1047
1048 for (auto && elem : arg | std::views::drop(1))
1049 {
1050 *stream_it = ',';
1051 stream_it.write_number(elem);
1052 }
1053 }
1054 }
1055 }
1056 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1057 {
1058 *stream_it = arg;
1059 }
1060 else // number
1061 {
1062 stream_it.write_number(arg);
1063 }
1064 };
1065
1066 for (auto & [tag, variant] : tag_dict)
1067 {
1068 *stream_it = separator;
1069
1070 char const char0 = tag / 256;
1071 char const char1 = tag % 256;
1072
1073 *stream_it = char0;
1074 *stream_it = char1;
1075 *stream_it = ':';
1076 *stream_it = detail::sam_tag_type_char[variant.index()];
1077 *stream_it = ':';
1078
1079 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1080 {
1081 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1082 *stream_it = ',';
1083 }
1084
1085 std::visit(stream_variant_fn, variant);
1086 }
1087}
1088
1089} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
The SAM format (tag).
Definition format_sam.hpp:108
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=delete
Deleted. Header holds a unique_ptr.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition format_sam.hpp:124
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition format_sam.hpp:313
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition format_sam.hpp:554
format_sam(format_sam &&)=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition format_sam.hpp:361
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition format_sam.hpp:267
format_sam(format_sam const &)=delete
Deleted. Header holds a unique_ptr.
format_sam()=default
Defaulted.
Stores the header information of SAM/BAM files.
Definition header.hpp:49
ref_ids_type & ref_ids()
The range of reference ids.
Definition header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition header.hpp:182
The SAM tag dictionary class that stores all optional SAM fields.
Definition sam_tag_dictionary.hpp:330
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition to_char.hpp:63
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:524
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ bit_score
The bit score (statistical significance indicator), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:63
constexpr auto is_space
Checks whether c is a space character.
Definition predicate.hpp:125
seqan::stl::ranges::to to
Converts a range to a container. <dl class="no-api">This entity is not part of the SeqAn API....
Definition to.hpp:26
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition type_list/traits.hpp:395
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
T push_back(T... args)
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
T size(T... args)
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition io/exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition io/exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition sam_file/output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition sam_file/output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition sam_file/output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition sequence_file/input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition sequence_file/input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition sequence_file/output_options.hpp:26
T substr(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::ranges::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)