SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <cstdio>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <stack>
20 #include <string>
21 #include <string_view>
22 #include <type_traits>
23 #include <vector>
24 
45 
46 namespace seqan3
47 {
85 {
86 public:
90  format_vienna() noexcept = default;
91  format_vienna(format_vienna const &) noexcept = default;
92  format_vienna & operator=(format_vienna const &) noexcept = default;
93  format_vienna(format_vienna &&) noexcept = default;
94  format_vienna & operator=(format_vienna &&) noexcept = default;
95  ~format_vienna() noexcept = default;
96 
98 
100  static inline std::vector<std::string> file_extensions
101  {
102  { "dbn" },
103  { "fasta" },
104  { "fa" }
105  };
106 
107 protected:
109  template <typename stream_type, // constraints checked by file
110  typename seq_legal_alph_type,
111  bool structured_seq_combined,
112  typename seq_type, // other constraints checked inside function
113  typename id_type,
114  typename bpp_type,
115  typename structure_type,
116  typename energy_type,
117  typename react_type,
118  typename comment_type,
119  typename offset_type>
120  void read_structure_record(stream_type & stream,
122  seq_type & seq,
123  id_type & id,
124  bpp_type & bpp,
125  structure_type & structure,
126  energy_type & energy,
127  react_type & SEQAN3_DOXYGEN_ONLY(react),
128  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
129  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
130  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
131  {
132  auto stream_view = detail::istreambuf(stream);
133 
134  // READ ID (if present)
135  auto constexpr is_id = is_char<'>'>;
136  if (is_id(*begin(stream_view)))
137  {
138  if constexpr (!detail::decays_to_ignore_v<id_type>)
139  {
140  if (options.truncate_ids)
141  {
142  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
143  | detail::take_until_or_throw(is_cntrl || is_blank)
144  | views::char_to<std::ranges::range_value_t<id_type>>,
145  std::cpp20::back_inserter(id));
146  detail::consume(stream_view | detail::take_line_or_throw);
147  }
148  else
149  {
150  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
151  | detail::take_line_or_throw
152  | views::char_to<std::ranges::range_value_t<id_type>>,
153  std::cpp20::back_inserter(id));
154  }
155  }
156  else
157  {
158  detail::consume(stream_view | detail::take_line_or_throw);
159  }
160  }
161  else if constexpr (!detail::decays_to_ignore_v<id_type>)
162  {
163  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
164  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
165  {
166  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
167  is_id.msg + " and char_is_valid_for<" +
168  detail::type_name_as_string<seq_legal_alph_type> + ">" +
169  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
170  }
171  }
172 
173  // READ SEQUENCE
174  if constexpr (!detail::decays_to_ignore_v<seq_type>)
175  {
176  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
177  std::ranges::copy(stream_view | detail::take_line_or_throw // until end of line
178  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
179  | std::views::transform([is_legal_seq](char const c)
180  {
181  if (!is_legal_seq(c)) // enforce legal alphabet
182  {
183  throw parse_error{std::string{"Encountered an unexpected letter: "} +
184  "char_is_valid_for<" +
185  detail::type_name_as_string<seq_legal_alph_type> +
186  "> evaluated to false on " +
187  detail::make_printable(c)};
188  }
189  return c;
190  })
191  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
192  std::cpp20::back_inserter(seq));
193  }
194  else
195  {
196  detail::consume(stream_view | detail::take_line_or_throw);
197  }
198 
199  // READ STRUCTURE (if present)
200  [[maybe_unused]] int64_t structure_length{};
201  if constexpr (!detail::decays_to_ignore_v<structure_type>)
202  {
203  if constexpr (structured_seq_combined)
204  {
205  assert(std::addressof(seq) == std::addressof(structure));
206  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
207  // We need the structure_length parameter to count the length of the structure while reading
208  // because we cannot infer it from the (already resized) structure_seq object.
209  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
210  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
211 
212  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
213  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
214  }
215  else
216  {
217  using alph_type = std::ranges::range_value_t<structure_type>;
218  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
219  structure_length = std::ranges::distance(structure);
220 
221  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
222  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
223  }
224  }
225  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
226  {
227  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
228  structure_length = std::ranges::distance(bpp);
229  }
230  else
231  {
232  detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
233  }
234 
235  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
236  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
237  {
238  if (std::ranges::distance(seq) != structure_length)
239  throw parse_error{"Found sequence and associated structure of different length."};
240  }
241 
242  // READ ENERGY (if present)
243  if constexpr (!detail::decays_to_ignore_v<energy_type>)
244  {
245  std::string e_str = stream_view | detail::take_line
246  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
247  | views::to<std::string>;
248  if (!e_str.empty())
249  {
250  size_t num_processed;
251  energy = std::stod(e_str, &num_processed);
252  if (num_processed != e_str.size()) // [[unlikely]]
253  {
254  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
255  }
256  }
257  }
258  else
259  {
260  detail::consume(stream_view | detail::take_line);
261  }
262  detail::consume(stream_view | detail::take_until(!is_space));
263  }
264 
266  template <typename stream_type, // constraints checked by file
267  typename seq_type, // other constraints checked inside function
268  typename id_type,
269  typename bpp_type,
270  typename structure_type,
271  typename energy_type,
272  typename react_type,
273  typename comment_type,
274  typename offset_type>
275  void write_structure_record(stream_type & stream,
276  structure_file_output_options const & options,
277  seq_type && seq,
278  id_type && id,
279  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
280  structure_type && structure,
281  energy_type && energy,
282  react_type && SEQAN3_DOXYGEN_ONLY(react),
283  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
284  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
285  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
286  {
287  std::cpp20::ostreambuf_iterator stream_it{stream};
288 
289  // WRITE ID (optional)
290  if constexpr (!detail::decays_to_ignore_v<id_type>)
291  {
292  if (!std::ranges::empty(id))
293  {
294  stream_it = '>';
295  stream_it = ' ';
296  std::ranges::copy(id, stream_it);
297  detail::write_eol(stream_it, options.add_carriage_return);
298  }
299  }
300 
301  // WRITE SEQUENCE
302  if constexpr (!detail::decays_to_ignore_v<seq_type>)
303  {
304  if (std::ranges::empty(seq)) //[[unlikely]]
305  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
306 
307  std::ranges::copy(seq | views::to_char, stream_it);
308  detail::write_eol(stream_it, options.add_carriage_return);
309  }
310  else
311  {
312  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
313  "when writing Vienna files."};
314  }
315 
316  // WRITE STRUCTURE (optional)
317  if constexpr (!detail::decays_to_ignore_v<structure_type>)
318  {
319  if (!std::ranges::empty(structure))
320  std::ranges::copy(structure | views::to_char, stream_it);
321 
322  // WRITE ENERGY (optional)
323  if constexpr (!detail::decays_to_ignore_v<energy_type>)
324  {
325  if (energy)
326  {
327 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
328 // auto [endptr, ec] = std::to_chars(str.data(),
329 // str.data() + str.size(),
330 // energy,
331 // std::chars_format::fixed,
332 // options.precision);
333 // if (ec == std::errc())
334 // std::ranges::copy(str.data(), endptr, stream_it);
335 // else
336 // throw std::runtime_error{"The energy could not be transformed into a string."};
337 
338  stream_it = ' ';
339  stream_it = '(';
340 
341  std::array<char, 100> str;
342  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
343  if (len < 0 || len >= 100)
344  throw std::runtime_error{"The energy could not be transformed into a string."};
345  std::ranges::copy(str.data(), str.data() + len, stream_it);
346 
347  stream_it = ')';
348  }
349  }
350  detail::write_eol(stream_it, options.add_carriage_return);
351  }
352  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
353  {
354  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
355  }
356  }
357 
358 private:
365  template <typename alph_type, typename stream_view_type>
366  auto read_structure(stream_view_type & stream_view)
367  {
368  auto constexpr is_legal_structure = char_is_valid_for<alph_type>;
369  return stream_view | detail::take_until(is_space) // until whitespace
370  | std::views::transform([is_legal_structure](char const c)
371  {
372  if (!is_legal_structure(c))
373  {
374  throw parse_error{
375  std::string{"Encountered an unexpected letter: char_is_valid_for<"} +
376  detail::type_name_as_string<alph_type> +
377  "> evaluated to false on " + detail::make_printable(c)};
378  }
379  return c;
380  }) // enforce legal alphabet
381  | views::char_to<alph_type>; // convert to actual target alphabet
382  }
383 };
384 
385 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Core alphabet concept and free function/type trait wrappers.
Provides seqan3::views::char_to.
Provides seqan3::views::to_char.
Provides alphabet adaptations for standard char types.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:85
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:101
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:120
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
Provides C++20 additions to the <iterator> header.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Definition: affine_cell_proxy.hpp:438
Adaptations of concepts from the Ranges TS.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
[DEPRECATED] Provides seqan3::views::take.
Provides traits to inspect some information of a type, for example its name.
Provides C++20 additions to the type_traits header.
Provides character predicates for tokenisation.
Provides seqan3::views::to.
Provides the WUSS format for RNA structure.