libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.cpp
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 *
19 * This program is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 * You should have received a copy of the GNU General Public License
30 * along with this program. If not, see <http://www.gnu.org/licenses/>.
31 */
32
33#include <QString>
34#include <QStringRef>
35#include <algorithm>
36#include "semiglobalalignment.h"
40#include "correctiontree.h"
43
44void
46{
47 peaks.clear();
48 m_peptideModel.reset();
49 score = 0;
50 begin_shift = 0.0;
51 end_shift = 0.0;
52 shifts.clear();
53 SPC = 0;
54 beginning = 0;
55 end = 0;
56}
57
58
59QString
60pappso::specpeptidoms::Alignment::getPeptideString(const QString &protein_sequence) const
61{
62 return protein_sequence.mid(beginning, end - beginning);
63}
64
65double
67{
68 double sum_of_elems = std::accumulate(shifts.begin(), shifts.end(), 0);
69 return begin_shift + sum_of_elems + end_shift;
70}
71
72
73std::size_t
78
79
81 const ScoreValues &score_values,
82 const pappso::PrecisionPtr precision_ptr,
83 const pappso::AaCode &aaCode)
84 : m_scorevalues(score_values), m_aaCode(aaCode)
85{
86 m_precision_ptr = precision_ptr;
87
88 KeyCell key_cell_init_first;
89 key_cell_init_first.n_row = 0;
90 key_cell_init_first.score = 0;
91 key_cell_init_first.beginning = 0;
92 key_cell_init_first.tree_id = 0;
93 m_interest_cells.push_back(key_cell_init_first);
94}
95
96const std::vector<pappso::specpeptidoms::KeyCell> &
101
102void
104 const SpOMSProtein *protein_ptr)
105{
106 // m_scenario.clear();
107 // TODO don't forget to reset any important variable
108 // m_best_alignment.reset();
109 // m_best_corrected_alignment.reset();
110 // m_best_post_processed_alignment.reset();
111
112 std::size_t sequence_length = protein_ptr->size();
113
114 KeyCell key_cell_init;
115 key_cell_init.n_row = 0;
116 key_cell_init.score = m_scorevalues.get(ScoreType::init);
117 key_cell_init.beginning = 0;
118 key_cell_init.tree_id = 0;
119
120 m_interest_cells.resize(spectrum.size());
121 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
122
123 m_interest_cells.at(0).score = 0;
124
125 // m_location_saver.resetLocationSaver();
126 m_updated_cells.clear();
127 for(std::size_t row_number = 1; row_number <= sequence_length; row_number++)
128 {
129 updateAlignmentMatrix(*protein_ptr,
130 row_number,
131 spectrum.getAaPositions(protein_ptr->at(row_number - 1).code),
132 spectrum,
133 true,
134 protein_ptr);
135 }
136}
137
138void
140 const SpOMSProtein *protein_ptr,
141 const std::size_t beginning,
142 const std::size_t length)
143{
144 try
145 {
146 qDebug();
147 const QString &protein_seq = protein_ptr->getSequence();
148 std::size_t length2;
149 if((qsizetype)(beginning + length) <= protein_seq.size())
150 {
151 length2 = length;
152 }
153 else
154 {
155 length2 = protein_seq.size() - beginning;
156 }
157
158 qDebug();
159 QString sequence_str = protein_seq.sliced(protein_seq.size() - beginning - length2, length2);
160
161 SpOMSProtein sequence("sub_sequence", sequence_str, m_aaCode);
162
163 // std::reverse(sequence.begin(), sequence.end());
164 std::vector<AaPosition> aa_positions;
165 CorrectionTree correction_tree;
166
167 qDebug();
168 m_scenario.reserve(length2 + 1, spectrum.size());
169 m_interest_cells.reserve(spectrum.size());
170 m_interest_cells.at(0).n_row = 0;
171 m_interest_cells.at(0).score = 0;
172 m_interest_cells.at(0).beginning = 0;
173 m_interest_cells.at(0).tree_id = 0;
174 for(std::size_t i = 1; i < m_interest_cells.size(); i++)
175 {
176 m_interest_cells.at(i).n_row = 0;
178 m_interest_cells.at(i).beginning = 0;
179 m_interest_cells.at(i).tree_id = 0;
180 }
181 qDebug();
182 for(std::size_t iter = m_interest_cells.size(); iter < spectrum.size(); iter++)
183 {
184 m_interest_cells.push_back({0, m_scorevalues.get(ScoreType::init), 0, 0});
185 }
186 qDebug();
187 m_scenario.resetScenario();
188 qDebug();
189 for(std::size_t row_number = 1; row_number <= length2; row_number++)
190 {
191
192 qDebug() << "row_number - 1=" << row_number - 1 << " sequence.size()=" << sequence.size();
193 // aa = Aa(sequence[row_number - 1].unicode());
194 updateAlignmentMatrix(sequence,
195 row_number,
196 spectrum.getAaPositions(sequence[row_number - 1].code),
197 spectrum,
198 false,
199 protein_ptr);
200 }
201 qDebug();
202 saveBestAlignment(sequence, spectrum, protein_seq.size() - beginning);
203
204 qDebug() << m_scenario.getBestScore() << " " << MIN_ALIGNMENT_SCORE;
205 // Correction : if complementary peaks are used, corrected spectra without one of the two
206 // peaks are generated and aligned. The best alignment is kept.
207 if(m_scenario.getBestScore() >
208 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
209 {
210 // we only correct alignment if the sequence has a minimum amino acid diversity
211 if(checkSequenceDiversity(sequence.getSequence(), 5, 2))
212 {
213
214 qDebug();
216 for(std::size_t iter : m_best_alignment.peaks)
217 {
218 if(iter > spectrum.getComplementaryPeak(iter))
219 {
220 break;
221 }
222 else if(std::find(m_best_alignment.peaks.begin(),
223 m_best_alignment.peaks.end(),
224 spectrum.getComplementaryPeak(iter)) !=
225 m_best_alignment.peaks.end())
226 {
227 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
228 }
229 }
230 qDebug();
231 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
232 if(corrections.size() > 0)
233 {
234 m_best_alignment.score =
235 0; // Reset the best alignment score (we dont want to keep
236 // the original alignment if corrections are needed)
237 qDebug();
238 for(auto peaks_to_remove : corrections)
239 {
240 qDebug();
241 correctAlign(sequence,
242 protein_ptr,
243 spectrum,
244 peaks_to_remove,
245 protein_seq.size() - beginning);
246 qDebug();
247 }
248 qDebug();
250 }
251 }
252 }
253 else
254 {
255 // this sequence has too much redundancy
256 // we have to lower the score
257 m_best_alignment.score = 0;
258 }
259 qDebug();
260 }
261 catch(const pappso::PappsoException &err)
262 {
264 QObject::tr("SemiGlobalAlignment::preciseAlign failed :\n%1").arg(err.qwhat()));
265 }
266}
267
268void
270 const SpOMSProtein *protein_ptr,
271 const SpOMSSpectrum &spectrum,
272 std::vector<std::size_t> &peaks_to_remove,
273 std::size_t offset)
274{
275 std::vector<AaPosition> aa_positions;
276 CorrectionTree correction_tree;
277 std::vector<std::size_t> final_peaks_to_remove;
278
279 KeyCell key_cell_init;
280 key_cell_init.beginning = 0;
281 key_cell_init.n_row = 0;
282 key_cell_init.score = m_scorevalues.get(ScoreType::init);
283 key_cell_init.tree_id = 0;
284
285 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
286
287 m_interest_cells.at(0).score = 0;
288
289 m_scenario.resetScenario();
290 qDebug();
291 for(qsizetype row_number = 1; row_number <= sequence.size(); row_number++)
292 {
293 qDebug() << row_number - 1 << " " << sequence.size();
294 qDebug() << "sequence[row_number - 1].aa" << (char)sequence[row_number - 1].aa;
295 qDebug();
296 aa_positions = spectrum.getAaPositions(sequence[row_number - 1].code, peaks_to_remove);
297 qDebug();
298 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, false, protein_ptr);
299 qDebug();
300 }
301
302 qDebug();
303 // Correction : if complementary peaks are used, corrected spectra without one of the two peaks
304 // are generated and aligned. The best alignment is kept.
305 qDebug() << m_scenario.getBestScore();
306 if(m_scenario.getBestScore() >
307 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
308 {
309 qDebug();
310 qDebug() << sequence.getSequence();
311 qDebug() << offset;
312 qDebug() << spectrum.getPrecursorCharge();
313 saveBestAlignment(sequence, spectrum, offset);
314 qDebug();
315 for(std::size_t iter : m_best_alignment.peaks)
316 {
317 qDebug() << "iter:" << iter << "comp:" << spectrum.getComplementaryPeak(iter);
318 if(iter == spectrum.getComplementaryPeak(iter))
319 {
320 continue;
321 }
322 else if(iter > spectrum.getComplementaryPeak(iter))
323 {
324 break;
325 }
326 else if(std::find(m_best_alignment.peaks.begin(),
327 m_best_alignment.peaks.end(),
328 spectrum.getComplementaryPeak(iter)) != m_best_alignment.peaks.end())
329 {
330 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
331 }
332 }
333 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
334 if(corrections.size() > 0)
335 {
336 for(auto new_peaks_to_remove : corrections)
337 {
338 final_peaks_to_remove = std::vector<std::size_t>(new_peaks_to_remove);
339 final_peaks_to_remove.insert(
340 final_peaks_to_remove.end(), peaks_to_remove.begin(), peaks_to_remove.end());
341 correctAlign(sequence, protein_ptr, spectrum, final_peaks_to_remove, offset);
342 }
343 }
344 else if(m_scenario.getBestScore() > m_best_corrected_alignment.score)
345 {
347 }
348 }
349 qDebug();
350}
351
352void
354 const SpOMSProtein *protein_ptr,
355 std::size_t beginning,
356 std::size_t length,
357 const std::vector<double> &shifts)
358{
359 std::size_t current_SPC = m_best_alignment.SPC;
360 int current_best_score = m_best_alignment.score;
362 for(double precursor_mass_error : shifts)
363 {
364 SpOMSSpectrum corrected_spectrum(spectrum, precursor_mass_error);
365 preciseAlign(corrected_spectrum, protein_ptr, beginning, length);
367 {
369 }
370 }
371 if(m_best_post_processed_alignment.SPC > current_SPC &&
372 m_best_post_processed_alignment.score >= current_best_score)
373 {
375 }
376}
377
378void
381 const std::size_t row_number,
382 const std::vector<AaPosition> &aa_positions,
383 const SpOMSSpectrum &spectrum,
384 const bool fast_align,
385 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
386{
387 int where = 0;
388 try
389 {
390 int score_found, score_shift, best_score, alt_score, tree_id;
391 uint32_t condition; // FIXME : may be used uninitialised
392 std::size_t best_column, shift, beginning, missing_aas, length, perfect_shift_origin;
393 KeyCell *current_cell_ptr, *tested_cell_ptr;
394 AlignType alignment_type, temp_align_type;
395
396 double smallest_aa_mass = m_aaCode.getMass((std::uint8_t)1);
397
398 m_updated_cells.reserve(aa_positions.size());
399 where = 1;
400 // Computation of the threePeaks condition, see spomsspectrum.h for more details.
401 if(fast_align)
402 {
403 condition = 3;
404 if(row_number > 1)
405 {
406 qDebug() << (char)sequence.at(row_number - 2).aa;
407 qDebug() << "condition" << condition;
408 condition += 2 << sequence.at(row_number - 2).code;
409 qDebug();
410 qDebug() << "condition" << condition;
411 }
412 }
413 where = 2;
414 for(std::vector<AaPosition>::const_iterator aa_position = aa_positions.begin();
415 aa_position != aa_positions.end();
416 aa_position++)
417 {
418
419 where = 3;
420 if(((condition & aa_position->condition) != 0) ||
421 !fast_align) // Verification of the threePeaks condition (only during first alignment).
422 {
423 current_cell_ptr = &m_interest_cells.at(aa_position->r_peak);
424 if(spectrum.peakType(aa_position->r_peak) ==
426 {
427 score_found = m_scorevalues.get(ScoreType::foundDouble);
429 }
430 else
431 {
432 score_found = m_scorevalues.get(ScoreType::found);
433 score_shift = m_scorevalues.get(ScoreType::foundShift);
434 }
435
436 // not found case (always computed)
437 best_column = aa_position->r_peak;
438 best_score = current_cell_ptr->score + (row_number - current_cell_ptr->n_row) *
440 beginning = current_cell_ptr->beginning;
441 tree_id = current_cell_ptr->tree_id;
442 alignment_type = AlignType::notFound;
443
444 // found case (Can only happen if the left peak is supported)
445 if(aa_position->l_support)
446 {
447 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak);
448 if(aa_position->l_peak == 0)
449 {
450 alt_score = tested_cell_ptr->score + score_found;
451 }
452 else
453 {
454 if(tested_cell_ptr->n_row == row_number - 1)
455 {
456 alt_score = tested_cell_ptr->score +
457 (row_number - tested_cell_ptr->n_row - 1) *
459 score_found;
460 }
461 else
462 {
463 alt_score = tested_cell_ptr->score +
464 (row_number - tested_cell_ptr->n_row - 1) *
466 score_shift;
467 }
468 }
469 if(alt_score >= best_score)
470 {
471 alignment_type = AlignType::found;
472 best_score = alt_score;
473 best_column = aa_position->l_peak;
474 if(best_column == 0)
475 {
476 if(row_number < ALIGNMENT_SURPLUS)
477 {
478 beginning = 0;
479 }
480 else
481 {
482 beginning = std::max((std::size_t)(row_number - ALIGNMENT_SURPLUS),
483 (std::size_t)0);
484 }
485 if(fast_align)
486 {
487 tree_id = m_location_saver.getNextTree();
488 }
489 }
490 else
491 {
492 beginning = tested_cell_ptr->beginning;
493 tree_id = tested_cell_ptr->tree_id;
494 }
495 }
496 }
497
498 where = 4;
499 // generic shift case (all shifts are tested)
500 shift = 0;
501 while(shift < aa_position->next_l_peak)
502 {
503 tested_cell_ptr = &m_interest_cells.at(aa_position->next_l_peak - shift);
504 // verification saut parfait
505 if(perfectShiftPossible(sequence,
506 spectrum,
507 tested_cell_ptr->n_row,
508 row_number,
509 aa_position->next_l_peak - shift,
510 aa_position->r_peak) &&
512 {
513 alt_score = tested_cell_ptr->score +
514 (row_number - tested_cell_ptr->n_row - 1) *
516 score_found;
517 temp_align_type = AlignType::perfectShift;
518 }
519 else
520 {
521 alt_score = tested_cell_ptr->score +
522 (row_number - tested_cell_ptr->n_row - 1) *
524 score_shift;
525 temp_align_type = AlignType::shift;
526 }
527 if(alt_score > best_score)
528 {
529 alignment_type = temp_align_type;
530 best_score = alt_score;
531 best_column = aa_position->next_l_peak - shift;
532 beginning = tested_cell_ptr->beginning;
533 tree_id = tested_cell_ptr->tree_id;
534 }
535 shift++;
536 }
537
538 where = 5;
539 // case shift from column 0 (no penalties if all precedent amino acids are missed)
540 tested_cell_ptr = &m_interest_cells.at(0);
541 // verification saut parfait
542 if(aa_position->r_peak <= TOL_PEAKS_MISSING_FIRST_COLUMN)
543 {
544 perfect_shift_origin =
545 perfectShiftPossibleFrom0(sequence, spectrum, row_number, aa_position->r_peak);
546 }
547 else
548 {
549 perfect_shift_origin = row_number;
550 }
551
552 if(perfect_shift_origin != row_number)
553 {
554 alt_score = tested_cell_ptr->score + score_found;
555 temp_align_type = AlignType::perfectShift;
556 }
557 else
558 {
559 alt_score = tested_cell_ptr->score + score_shift;
560 temp_align_type = AlignType::shift;
561 }
562
563 where = 6;
564 if(alt_score > best_score)
565 {
566 alignment_type = temp_align_type;
567 best_score = alt_score;
568 best_column = 0;
569 missing_aas =
570 std::floor(spectrum.getMZShift(0, aa_position->l_peak) / smallest_aa_mass);
571 if(row_number < ALIGNMENT_SURPLUS + missing_aas)
572 {
573 beginning = 0;
574 }
575 else
576 {
577 beginning =
578 std::max((std::size_t)(row_number - missing_aas - ALIGNMENT_SURPLUS),
579 (std::size_t)0);
580 }
581 where = 7;
582 if(fast_align)
583 {
584 tree_id = m_location_saver.getNextTree();
585 }
586 }
587
588 where = 8;
589 if(best_column != aa_position->r_peak)
590 {
591 m_updated_cells.push_back(
592 {aa_position->r_peak, {row_number, best_score, beginning, tree_id}});
593 }
594
595 where = 9;
596 if(best_score > m_location_saver.getMinScore(tree_id) && fast_align)
597 {
598 length =
599 row_number - beginning + 1 +
600 std::ceil(spectrum.getMissingMass(aa_position->r_peak) / smallest_aa_mass) +
602 where = 10;
603 m_location_saver.addLocation(beginning, length, tree_id, best_score, protein_ptr);
604 }
605 else if(!fast_align)
606 {
607
608 where = 11;
609 if(alignment_type == AlignType::perfectShift && best_column == 0)
610 {
611 m_scenario.saveOrigin(row_number,
612 aa_position->r_peak,
613 perfect_shift_origin,
614 0,
615 best_score,
617 }
618 else
619 {
620 m_scenario.saveOrigin(row_number,
621 aa_position->r_peak,
622 m_interest_cells.at(best_column).n_row,
623 best_column,
624 best_score,
625 alignment_type);
626 }
627 }
628 }
629 }
630
631 where = 30;
632 // Update row number in column 0
633 m_updated_cells.push_back({0, {row_number, 0, 0, 0}});
634
635 // Save updated key cells in the matrix
636 while(m_updated_cells.size() > 0)
637 {
638 qDebug() << m_interest_cells.size() << " " << m_updated_cells.back().first;
639 m_interest_cells.at(m_updated_cells.back().first) = m_updated_cells.back().second;
640 m_updated_cells.pop_back();
641 }
642 where++;
643 }
644 catch(const std::exception &error)
645 {
647 QObject::tr("updateAlignmentMatrix failed std::exception :\n%1 %2")
648 .arg(where)
649 .arg(error.what()));
650 }
651 catch(const pappso::PappsoException &err)
652 {
654 QObject::tr("updateAlignmentMatrix failed :\n%1").arg(err.qwhat()));
655 }
656}
657
658bool
661 const SpOMSSpectrum &spectrum,
662 const std::size_t origin_row,
663 const std::size_t current_row,
664 const std::size_t l_peak,
665 const std::size_t r_peak) const
666{
667 try
668 {
669 double missing_mass = 0;
670 auto it_end = sequence.begin() + current_row;
671 for(auto iter = sequence.begin() + origin_row; (iter != it_end) && (iter != sequence.end());
672 iter++)
673 {
674 missing_mass += iter->mass; // Aa(iter->unicode()).getMass();
675 }
676 if(missing_mass > 0)
677 {
678 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
679 return mz_range.contains(spectrum.getMZShift(l_peak, r_peak));
680 }
681 else
682 {
683 return false;
684 }
685 }
686 catch(const std::exception &error)
687 {
689 QObject::tr("perfectShiftPossible failed std exception:\n%1").arg(error.what()));
690 }
691 catch(const pappso::PappsoException &err)
692 {
694 QObject::tr("perfectShiftPossible failed :\n%1").arg(err.qwhat()));
695 }
696}
697
698std::size_t
701 const SpOMSSpectrum &spectrum,
702 const std::size_t current_row,
703 const std::size_t r_peak) const
704{
705 std::size_t perfect_shift_origin = current_row;
706 double missing_mass = spectrum.getMZShift(0, r_peak);
707 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
708 double aa_mass = 0;
709 while(aa_mass < missing_mass && perfect_shift_origin > 0 && !mz_range.contains(aa_mass))
710 {
711 aa_mass += sequence.at(perfect_shift_origin - 1)
712 .mass; // Aa(sequence.at(perfect_shift_origin - 1).unicode()).getMass();
713 perfect_shift_origin--;
714 }
715 if(mz_range.contains(aa_mass))
716 {
717 return perfect_shift_origin;
718 }
719 else
720 {
721 return current_row;
722 }
723}
724
725std::size_t
728 const SpOMSSpectrum &spectrum,
729 std::size_t end_row,
730 std::size_t end_peak) const
731{
732 try
733 {
734 std::size_t perfect_shift_end = end_row + 1;
735 double missing_mass = spectrum.getMissingMass(end_peak);
736 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
737 double aa_mass = 0;
738 while(aa_mass < missing_mass && perfect_shift_end < (std::size_t)sequence.size() &&
739 !mz_range.contains(aa_mass))
740 {
741 aa_mass += sequence.at(perfect_shift_end - 1)
742 .mass; // Aa(sequence.at(perfect_shift_end - 1).unicode()).getMass();
743 perfect_shift_end++;
744 }
745 if(mz_range.contains(aa_mass))
746 {
747 return perfect_shift_end - 1;
748 }
749 else
750 {
751 return end_row;
752 }
753 }
754 catch(const pappso::PappsoException &err)
755 {
757 QObject::tr("perfectShiftPossibleEnd failed :\n%1").arg(err.qwhat()));
758 }
759}
760
764
770
776
777void
780 const SpOMSSpectrum &spectrum,
781 std::size_t offset)
782{
783 qDebug();
784 m_best_alignment.m_peptideModel.reset();
785 m_best_alignment.peaks.clear();
786 m_best_alignment.shifts.clear();
787 std::size_t previous_row; // FIXME : may be used uninitialised
788 std::size_t previous_column = 0;
789 std::size_t perfect_shift_end;
790 std::pair<std::vector<ScenarioCell>, int> best_alignment = m_scenario.getBestAlignment();
791 m_best_alignment.score = best_alignment.second;
792 std::vector<SpOMSAa> skipped_aa;
793 double skipped_mass;
794 // Retrieving beginning and end
795 if(best_alignment.first.front().previous_row > offset)
796 {
798 QString("best_alignment.first.front().previous_row > offset %1 %2")
799 .arg(offset)
800 .arg(best_alignment.first.front().previous_row));
801 }
802 if(best_alignment.first.back().previous_row > offset)
803 {
805 QString("best_alignment.first.back().previous_row > offset %1 %2")
806 .arg(offset)
807 .arg(best_alignment.first.back().previous_row));
808 }
809 m_best_alignment.beginning = offset - best_alignment.first.front().previous_row;
810 m_best_alignment.end = offset - best_alignment.first.back().previous_row - 1;
811
812 qDebug();
813 AminoAcidModel aa_model;
814 aa_model.m_massDifference = 0;
815 // Filling temp_interpretation and peaks vectors
816 for(auto cell : best_alignment.first)
817 {
818 switch(cell.alignment_type)
819 {
820 case AlignType::found:
821 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
822 aa_model.m_massDifference = 0;
823 aa_model.m_skipped = false;
824 m_best_alignment.m_peptideModel.push_back(aa_model);
825 if(previous_row > cell.previous_row + 1)
826 {
827 skipped_mass = sequence.at(previous_row - 1)
828 .mass; // Aa(sequence.at(previous_row - 1).unicode()).getMass();
829 skipped_aa =
830 sequence.sliced(cell.previous_row, previous_row - cell.previous_row - 1);
831 aa_model.m_massDifference = 0;
832 aa_model.m_skipped = true;
833 for(auto aa : skipped_aa)
834 {
835 aa_model.m_aminoAcid = aa.aa;
836 m_best_alignment.m_peptideModel.push_back(aa_model);
837 skipped_mass += aa.mass; // Aa(aa.unicode()).getMass();
838 }
839 m_best_alignment.m_peptideModel.back().m_massDifference =
840 spectrum.getMZShift(cell.previous_column, previous_column) - skipped_mass;
841 }
842 m_best_alignment.peaks.push_back(cell.previous_column);
843 break;
845 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
846 aa_model.m_massDifference = 0;
847 aa_model.m_skipped = true;
848 m_best_alignment.m_peptideModel.push_back(aa_model);
849 break;
850 case AlignType::shift:
851
852 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
853 aa_model.m_massDifference = spectrum.getMZShift(cell.previous_column, previous_column) -
854 aa_model.m_aminoAcid.getMass();
855 aa_model.m_skipped = false;
856 m_best_alignment.m_peptideModel.push_back(aa_model);
857 m_best_alignment.peaks.push_back(cell.previous_column);
858 m_best_alignment.shifts.push_back(
859 spectrum.getMZShift(cell.previous_column, previous_column) -
860 sequence.at(previous_row - 1).mass);
861 break;
863 m_best_alignment.peaks.push_back(cell.previous_column);
864 skipped_aa = sequence.sliced(cell.previous_row, previous_row - cell.previous_row);
865 std::reverse(skipped_aa.begin(), skipped_aa.end());
866 aa_model.m_massDifference = 0;
867 aa_model.m_skipped = false;
868 for(auto aa : skipped_aa)
869 {
870 aa_model.m_aminoAcid = aa.aa;
871 m_best_alignment.m_peptideModel.push_back(aa_model);
872 }
873 break;
874 case AlignType::init:
875 previous_row = cell.previous_row;
876 previous_column = cell.previous_column;
877 m_best_alignment.peaks.push_back(cell.previous_column);
878 break;
879 }
880 previous_row = cell.previous_row;
881 previous_column = cell.previous_column;
882 }
883 std::reverse(m_best_alignment.peaks.begin(), m_best_alignment.peaks.end());
884 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
885
886 qDebug();
887 // Compute begin_shift and end_shift
888 MzRange zero(0, m_precision_ptr);
889 m_best_alignment.begin_shift = spectrum.getMZShift(0, m_best_alignment.peaks.front());
890 m_best_alignment.end_shift = spectrum.getMissingMass(m_best_alignment.peaks.back());
891 if(zero.contains(m_best_alignment.end_shift))
892 {
893 m_best_alignment.end_shift = 0;
894 }
895
896 qDebug();
897 // Computing SPC
898 m_best_alignment.SPC = 0;
899 for(auto peak : m_best_alignment.peaks)
900 {
901 switch(spectrum.at(peak).type)
902 {
904 qDebug() << peak << "native";
905 m_best_alignment.SPC += 1;
906 break;
908 qDebug() << peak << "both";
909 m_best_alignment.SPC += 2;
910 break;
912 qDebug() << peak << "synthetic";
913 break;
915 qDebug() << peak << "symmetric";
916 m_best_alignment.SPC += 1;
917 break;
918 }
919 }
920
921 qDebug();
922 // Final check of the end shift
923 if(m_best_alignment.end_shift > 0)
924 {
925 perfect_shift_end = perfectShiftPossibleEnd(sequence,
926 spectrum,
927 best_alignment.first.front().previous_row,
928 m_best_alignment.peaks.back());
929 if(perfect_shift_end != best_alignment.first.front().previous_row)
930 {
931 skipped_aa =
932 sequence.sliced(best_alignment.first.front().previous_row,
933 perfect_shift_end - best_alignment.first.front().previous_row);
934 aa_model.m_massDifference = 0;
935 aa_model.m_skipped = true;
936 for(auto aa = skipped_aa.begin(); aa != skipped_aa.end(); aa++)
937 {
938 aa_model.m_aminoAcid = aa->aa;
939 m_best_alignment.m_peptideModel.push_back(aa_model);
940 }
941 m_best_alignment.beginning = offset - perfect_shift_end;
942 m_best_alignment.end_shift = 0;
943 }
944 else
945 {
947 }
948 }
949
950 qDebug();
951 // Writing final interpretation
952 if(m_best_alignment.end_shift > 0)
953 {
954 m_best_alignment.m_peptideModel.setNterShift(m_best_alignment.end_shift);
955 }
956
957 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
958 if(m_best_alignment.begin_shift > 0)
959 {
960 m_best_alignment.m_peptideModel.setCterShift(m_best_alignment.begin_shift);
961 }
962
963 m_best_alignment.m_peptideModel.setPrecursorMass(spectrum.getPrecursorMass());
964 qDebug();
965}
966
972
973std::vector<double>
975 const Alignment &alignment,
976 const QString &protein_seq)
977{
978 // qDebug() << protein_seq;
979 if(alignment.end > (std::size_t)protein_seq.size())
980 {
981 throw pappso::ExceptionOutOfRange(QString("alignment.end > protein_seq.size() %1 %2")
982 .arg(alignment.end)
983 .arg(protein_seq.size()));
984 }
985 std::vector<double> potential_mass_errors(alignment.shifts);
986 double shift = alignment.end_shift;
987 std::size_t index;
988 if(alignment.beginning > 0)
989 { // -1 on unsigned int makes it wrong
990 index = alignment.beginning - 1;
991 while(shift > 0 && index > 0)
992 {
993 potential_mass_errors.push_back(shift);
994 // qDebug() << " shift=" << shift << " index=" << index
995 // << " letter=" << protein_seq.at(index).toLatin1();
996 shift -= aa_code.getMass(
997 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
998 index--;
999 }
1000 }
1001
1002 // qDebug() << "second";
1003 shift = alignment.begin_shift;
1004 index = alignment.end + 1;
1005 while(shift > 0 && index < (std::size_t)protein_seq.size())
1006 {
1007 potential_mass_errors.push_back(shift);
1008 qDebug() << " shift=" << shift << " index=" << index
1009 << " letter=" << protein_seq.at(index).toLatin1();
1010 shift -= aa_code.getMass(
1011 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1012 index++;
1013 }
1014 // qDebug();
1015 return potential_mass_errors;
1016}
1017
1018bool
1020 std::size_t window,
1021 std::size_t minimum_aa_diversity)
1022{
1023 qDebug() << "sequence=" << sequence << " window=" << window
1024 << " minimum_aa_diversity=" << minimum_aa_diversity;
1025 if(sequence.size() < window)
1026 return false;
1027 auto it_begin = sequence.begin();
1028 auto it_end = sequence.begin() + window;
1029 QString window_copy(sequence.mid(0, window));
1030 while(it_end != sequence.end())
1031 {
1032 std::partial_sort_copy(it_begin, it_end, window_copy.begin(), window_copy.end());
1033
1034 qDebug() << window_copy;
1035 std::size_t uniqueCount =
1036 std::unique(window_copy.begin(), window_copy.end()) - window_copy.begin();
1037
1038 qDebug() << uniqueCount;
1039 if(uniqueCount < minimum_aa_diversity)
1040 return false;
1041 it_begin++;
1042 it_end++;
1043 }
1044 return true;
1045}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
double getMass(uint8_t aa_code) const
get the mass of the amino acid given its integer code the amino acid can bear some modification (if a...
Definition aacode.cpp:224
pappso_double getMass() const override
Definition aa.cpp:90
bool contains(pappso_double) const
Definition mzrange.cpp:116
virtual const QString & qwhat() const
std::vector< std::vector< std::size_t > > getPeaks() const
void addPeaks(std::size_t peak1, std::size_t peak2)
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak....
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
SemiGlobalAlignment(const ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, const AaCode &aaCode)
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::vector< SpOMSAa > sliced(std::size_t position, std::size_t length) const
double getMZShift(std::size_t l_peak, std::size_t r_peak) const
Returns the mz difference between two peaks.
uint getPrecursorCharge() const
Returns the spectrum's precursor's charge.
double getMissingMass(std::size_t peak) const
Returns the missing mass between a peak and the precursor's mass (shift at the end).
std::size_t getComplementaryPeak(std::size_t peak) const
const std::vector< AaPosition > & getAaPositions(std::uint8_t aa_code) const
Returns the list of aa_positions for a given amino acid code.
specglob::ExperimentalSpectrumDataPointType peakType(std::size_t indice) const
Returns the type of one of the spectrum's peaks.
@ synthetic
does not correspond to existing peak, for computational purpose
Definition types.h:85
@ both
both, the ion and the complement exists in the original spectrum
Definition types.h:83
@ symmetric
new peak : computed symmetric mass from a corresponding native peak
Definition types.h:81
const uint ALIGNMENT_SURPLUS(5)
const int MIN_ALIGNMENT_SCORE(15)
const uint TOL_PEAKS_MISSING(4)
const uint TOL_PEAKS_MISSING_FIRST_COLUMN(5)
const PrecisionBase * PrecisionPtr
Definition precision.h:122
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence