libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
mzmlconvert.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/cbor/mzcbor/mzmlconvert.cpp
3 * \date 19/11/2025
4 * \author Olivier Langella
5 * \brief convert mzML to mzcbor
6 */
7
8/*******************************************************************************
9 * Copyright (c) 2025 Olivier Langella <Olivier.Langella@universite-paris-saclay.fr>.
10 *
11 * This file is part of PAPPSOms-tools.
12 *
13 * PAPPSOms-tools is free software: you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation, either version 3 of the License, or
16 * (at your option) any later version.
17 *
18 * PAPPSOms-tools is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with PAPPSOms-tools. If not, see <http://www.gnu.org/licenses/>.
25 *
26 ******************************************************************************/
27
28#include "mzmlconvert.h"
29#include <QDebug>
30#include <zlib.h>
32#include "pappsomspp/config.h"
33#include "binarydataarray.h"
34#include "cvparam.h"
35
36
39 : mp_monitor(p_monitor), mp_cborWriter(p_output)
40{
42 m_elementToStoreInArray << "cv" << "userParam" << "cvParam" << "binaryDataArray" << "spectrum"
43 << "sourceFile"
44 << "referenceableParamGroup" << "software" << "instrumentConfiguration"
45 << "processingMethod" << "dataProcessing" << "scan" << "scanWindow"
46 << "precursor" << "selectedIon";
47
48
49 m_elementStash.clear();
51 m_doubleArray.clear();
52 m_runIdList.clear();
55}
56
60
61
62void
64{
65 writer->startMap();
66
67 writer->writeInformations(PAPPSOMSPP_NAME, PAPPSOMSPP_VERSION, "mzCBORindex", "mzMLconvert");
68
69 writer->append("from");
70 writer->append(m_uuid);
71
72 writer->append("runIdList");
73 writer->writeArray(m_runIdList);
74
75
76 std::vector<std::size_t> msrun_size;
77
78
79 writer->append("runSpectrumIndexList");
80 writer->startArray(m_runAndSpectrumOffsetList.size());
81 for(auto &spectrum_offset_list : m_runAndSpectrumOffsetList)
82 {
83 msrun_size.push_back(spectrum_offset_list.size());
84 writer->writeArray(spectrum_offset_list);
85 }
86 writer->endArray();
87
88
89 writer->append("runSpectrumNativeIdList");
90 writer->startArray(m_runAndSpectrumIdList.size());
91 for(auto &spectrum_id_list : m_runAndSpectrumIdList)
92 {
93 writer->writeArray(spectrum_id_list);
94 }
95 writer->endArray();
96
97 // TIC
98 bool size_ok = true;
99 for(std::size_t i = 0; i < msrun_size.size(); i++)
100 {
101 if(m_runAndSpectrumTotalIonCountList.at(i).size() != msrun_size.at(i))
102 {
103 size_ok = false;
104 }
105 }
106 if(size_ok)
107 {
108 writer->append("runSpectrumTotalIonCountList");
109 writer->startArray(m_runAndSpectrumIdList.size());
110 for(auto &spectrum_tic_list : m_runAndSpectrumTotalIonCountList)
111 {
112 writer->writeArray(spectrum_tic_list);
113 }
114 writer->endArray();
115 }
116
117 // retention time
118 size_ok = true;
119 for(std::size_t i = 0; i < msrun_size.size(); i++)
120 {
121 if(m_runAndSpectrumRtList.at(i).size() != msrun_size.at(i))
122 {
123 size_ok = false;
124 }
125 }
126 if(size_ok)
127 {
128
129 writer->append("runSpectrumRtList");
130 writer->startArray(m_runAndSpectrumRtList.size());
131 for(auto &spectrum_rt_list : m_runAndSpectrumRtList)
132 {
133 writer->writeArray(spectrum_rt_list);
134 }
135 writer->endArray();
136 }
137 else
138 {
139 // qFatal() << "wrong size";
140 }
141
142 // MS Level
143 size_ok = true;
144 for(std::size_t i = 0; i < msrun_size.size(); i++)
145 {
146 if(m_runAndSpectrumMsLevelList.at(i).size() != msrun_size.at(i))
147 {
148 size_ok = false;
149 }
150 }
151 if(size_ok)
152 {
153 writer->append("runSpectrumMsLevelList");
154 writer->startArray(m_runAndSpectrumMsLevelList.size());
155 for(auto &spectrum_mslevel_list : m_runAndSpectrumMsLevelList)
156 {
157 writer->writeArray(spectrum_mslevel_list);
158 }
159 writer->endArray();
160 }
161
162 writer->endMap();
163}
164
165
166void
168{
169 mp_cborWriter->startMap();
170 if(m_qxmlStreamReader.readNextStartElement())
171 {
172 qDebug() << m_qxmlStreamReader.name().toString();
173 if(m_qxmlStreamReader.name().toString() == "indexedmzML")
174 {
175 m_qxmlStreamReader.readNextStartElement();
176 }
177 if(m_qxmlStreamReader.name().toString() == "mzML")
178 {
179 // write mzCBOR header
180 mp_cborWriter->append("mzCBOR");
181 mp_cborWriter->startMap();
182
183 mp_cborWriter->append("mode");
184 mp_cborWriter->append(0);
185
186 mp_cborWriter->writeInformations(
187 PAPPSOMSPP_NAME, PAPPSOMSPP_VERSION, "mzCBOR", "mzMLconvert");
188
189 m_uuid = mp_cborWriter->getUuid();
190 mp_cborWriter->endMap();
191
192 mp_cborWriter->append(m_qxmlStreamReader.name().toString());
193
194 mp_cborWriter->startMap();
195 mp_cborWriter->append("xmlns");
196 mp_cborWriter->append(m_qxmlStreamReader.namespaceUri());
198 mp_cborWriter->endMap();
199
200
201 bool array_started = false;
202 QString last_element;
203 while(m_qxmlStreamReader.readNextStartElement())
204 {
205 qDebug();
206 insideElement(last_element, array_started);
207 last_element = m_qxmlStreamReader.name().toString();
208 qDebug();
209 }
210 }
211 else
212 {
213 m_qxmlStreamReader.raiseError(QObject::tr("Not an mzML input file"));
214 m_qxmlStreamReader.skipCurrentElement();
215 }
216 }
217 mp_cborWriter->endMap();
218}
219
220
221void
223{
224 // defaultArrayLength 1552
225
226 m_currentSpectrumSize = m_qxmlStreamReader.attributes().value("defaultArrayLength").toULongLong();
227}
228
229
230void
232{
233 // m_qxmlStreamReader.skipCurrentElement();
234 /*
235 *
236 <binaryDataArray encodedLength="9092">
237 <cvParam cvRef="MS" accession="MS:1000515" value="" name="intensity array"
238 unitAccession="MS:1000131" unitName="number of counts" unitCvRef="MS" /> <cvParam cvRef="MS"
239 accession="MS:1000523" value="" name="64-bit float" /> <cvParam cvRef="MS" accession="MS:1000574"
240 value="" name="zlib compression" /> <binary>*/
241 std::size_t count = m_qxmlStreamReader.attributes().value("count").toULongLong();
242
243 mp_cborWriter->append("binaryDataArray");
244 mp_cborWriter->startArray(count);
245 while(m_qxmlStreamReader.readNext() && !m_qxmlStreamReader.isEndElement())
246 {
247 if(m_qxmlStreamReader.isStartElement())
248 {
249 BinaryDataArray binary_data_array;
250 binary_data_array.fromMzml(m_qxmlStreamReader);
251 binary_data_array.toCbor(*mp_cborWriter);
252 // writeZlibDataArray();
253 }
254 }
255 mp_cborWriter->endArray();
256}
257
258
259void
260pappso::cbor::mzcbor::MzmlConvert::insideElement(QString &last_element_in, bool &array_started_in)
261{
262 m_elementStash.push_back(m_qxmlStreamReader.name().toString());
263
264
265 if(m_elementStash.back() == "spectrum")
266 {
267 // qDebug() << m_qxmlStreamReader.attributes().value("id").toString();
268 m_runAndSpectrumIdList.back().push_back(
269 m_qxmlStreamReader.attributes().value("id").toString());
271 // qFatal();
272 }
273
274
275 if(m_elementStash.back() == "binaryDataArrayList")
276 {
278 }
279
280 else
281 {
282 // stop an array ?
283 qDebug() << "current element=" << m_elementStash.back();
284 qDebug() << "last_element=" << last_element_in;
285 if(array_started_in && (last_element_in != m_qxmlStreamReader.name().toString()))
286 {
287 mp_cborWriter->endArray();
288 array_started_in = false;
289 }
290
291
292 // start an array ?
293 if(m_elementToStoreInArray.contains(m_elementStash.back()))
294 {
295 // start an array ?
296 if((!array_started_in) && (last_element_in != m_elementStash.back()))
297 {
298 mp_cborWriter->append(m_elementStash.back());
299 mp_cborWriter->startArray();
300
301 array_started_in = true;
302 }
303 }
304
305 if(m_elementStash.back() == "spectrum")
306 {
307 if(mp_cborWriter->device() != nullptr)
308 m_runAndSpectrumOffsetList.back().push_back(mp_cborWriter->device()->pos());
309 }
310
311
312 if(!array_started_in)
313 mp_cborWriter->append(m_elementStash.back());
314
315 bool array_started = false;
316
317
318 if(m_elementStash.back() == "cvParam")
319 {
320 // array_started = true;
321 qDebug() << m_qxmlStreamReader.name() << " "
322 << m_elementStash.at(m_elementStash.size() - 2);
323 CvParam cv_param;
325 cv_param.toCbor(*mp_cborWriter);
326 qDebug() << cv_param.name;
327
328 if(m_elementStash.at(m_elementStash.size() - 2) == "spectrum")
329 {
330 qDebug() << "cvparam in spectrum";
331 if(cv_param.accession == "MS:1000511")
332 {
333 m_runAndSpectrumMsLevelList.back().push_back(cv_param.getExpectedUint8());
334 qDebug() << m_runAndSpectrumMsLevelList.back().back();
335 }
336 else if(cv_param.accession == "MS:1000285")
337 { // TIC
338 m_runAndSpectrumTotalIonCountList.back().push_back(cv_param.getExpectedDouble());
339 }
340 }
341 else if(m_elementStash.at(m_elementStash.size() - 2) == "scan")
342 {
343 if(cv_param.accession == "MS:1000016")
344 { // rt
345 double rt = cv_param.getExpectedDouble();
346
347 if(cv_param.unitAccession == "UO:0000031")
348 {
349 // // minutes
350 rt = rt * 60;
351 }
352 m_runAndSpectrumRtList.back().push_back(rt);
353 }
354 }
355 qDebug();
356 }
357 else
358 {
359
360 mp_cborWriter->startMap();
362
363 QString last_element;
364
365 while(m_qxmlStreamReader.readNext() && !m_qxmlStreamReader.isEndElement())
366 {
367
368 if(m_qxmlStreamReader.isCharacters())
369 {
370 // clean content:
371 QStringView content = m_qxmlStreamReader.text().trimmed();
372 if((m_qxmlStreamReader.text().toString() == "\n") ||
373 (m_qxmlStreamReader.text().toString() == "\n\t"))
374 {
375 }
376 else
377 {
378 // text node
379 if(!content.isEmpty())
380 {
381 qDebug() << "text isCharacters" << content.mid(0, 10);
382 mp_cborWriter->append("@text@");
383 mp_cborWriter->append(content);
384 }
385 }
386 }
387 else if(m_qxmlStreamReader.isStartElement())
388 {
389 QString tmp_element = m_qxmlStreamReader.name().toString();
390 insideElement(last_element, array_started);
391 last_element = tmp_element;
392 }
393 }
394
395 if(array_started)
396 {
397 mp_cborWriter->endArray();
398 }
399
400 mp_cborWriter->endMap();
401 }
402 }
403
404 qDebug() << m_elementStash.back();
405 m_elementStash.pop_back();
406}
407
408
409void
411{
412 bool ok(false);
413 double d = value_str.toDouble(&ok);
414 if(ok)
415 {
416 if(value_str.contains('.'))
417 {
418 mp_cborWriter->append(d);
419 }
420 else
421 {
422 qint64 bigint = value_str.toLongLong(&ok);
423 if(ok)
424 {
425 mp_cborWriter->append(bigint);
426 }
427 }
428 }
429 else
430 {
431 mp_cborWriter->append(value_str);
432 }
433}
434
435
436void
437pappso::cbor::mzcbor::MzmlConvert::attributeListToCbor(const QXmlStreamAttributes &xml_attributes)
438{
439 for(auto &xml_attribute : xml_attributes)
440 {
441 qDebug() << xml_attribute.name() << " " << xml_attribute.value();
442 mp_cborWriter->append(xml_attribute.name());
443 attributeValueToCbor(xml_attribute.value());
444
445 if((m_elementStash.size() > 0) && (m_elementStash.back() == "run") &&
446 (xml_attribute.name() == "id"))
447 {
448 m_runAndSpectrumOffsetList.push_back(std::vector<qint64>());
449 m_runAndSpectrumIdList.push_back(std::vector<QString>());
450 m_runAndSpectrumMsLevelList.push_back(std::vector<std::uint8_t>());
451 m_runAndSpectrumRtList.push_back(std::vector<double>());
452 m_runAndSpectrumTotalIonCountList.push_back(std::vector<qint64>());
453 m_runIdList.push_back(xml_attribute.value().toString());
454 }
455 }
456}
457
458const std::vector<QString> &
464
465const std::vector<std::vector<qint64>> &
470
471const std::vector<std::vector<QString>> &
PSI BinaryDataArray object for mzML/mzCBOR.
void writeInformations(const QString &software_name, const QString &software_version, const QString &type, const QString &operation)
void writeArray(const std::vector< std::size_t > &int_list)
virtual void readStream() override
std::vector< QString > m_elementStash
Definition mzmlconvert.h:84
std::vector< std::vector< QString > > m_runAndSpectrumIdList
Definition mzmlconvert.h:93
std::vector< std::vector< qint64 > > m_runAndSpectrumTotalIonCountList
Definition mzmlconvert.h:94
void writeMzcborIndex(pappso::cbor::CborStreamWriter *writer) const
std::vector< std::vector< std::uint8_t > > m_runAndSpectrumMsLevelList
Definition mzmlconvert.h:95
const std::vector< std::vector< qint64 > > & getRunAndSpectrumOffsetList() const
void insideElement(QString &last_element, bool &array_started)
const std::vector< QString > & getRunIdList() const
std::vector< std::vector< double > > m_runAndSpectrumRtList
Definition mzmlconvert.h:96
void attributeListToCbor(const QXmlStreamAttributes &xml_attributes)
const std::vector< std::vector< QString > > & getRunAndSpectrumIdList() const
std::vector< QString > m_runIdList
Definition mzmlconvert.h:91
pappso::UiMonitorInterface * mp_monitor
Definition mzmlconvert.h:81
void attributeValueToCbor(const QStringView &value_str)
MzmlConvert(pappso::UiMonitorInterface *p_monitor, pappso::cbor::CborStreamWriter *p_output)
std::vector< std::vector< qint64 > > m_runAndSpectrumOffsetList
Definition mzmlconvert.h:92
pappso::cbor::CborStreamWriter * mp_cborWriter
Definition mzmlconvert.h:82
#define PAPPSOMSPP_VERSION
Definition config.h:6
#define PAPPSOMSPP_NAME
Definition config.h:5
PSI cvParam object for mzML/mzCBOR.
void fromMzml(QXmlStreamReader &reader)
void toCbor(CborStreamWriter &writer)
void fromMzml(QXmlStreamReader &reader)
Definition cvparam.cpp:113
void toCbor(CborStreamWriter &writer)
Definition cvparam.cpp:153
std::uint8_t getExpectedUint8() const
Definition cvparam.cpp:248
double getExpectedDouble() const
Definition cvparam.cpp:280