From 0068b5c0d20fd00bb8f811542bc907e0911a3dad Mon Sep 17 00:00:00 2001 From: Julianus Pfeuffer Date: Fri, 24 Feb 2023 14:00:04 +0100 Subject: [PATCH] fix many glossary terms (#331) * Update .github/workflows/flakeheaven.yml * added packages * fix many glossary terms * added blacken-docs * testing blacken-docs * testing blacken-docs * testing blacken-docs * testing action suggester * testing action suggester * testing action suggester * remove additional file commits * added run step for blacken-docs * testing action-suggester with flakeheaven * updated glossary * updated glossary references * added :chem: role * changed chem_role setup * merge from master branch * removed m/z & peak from glossary * removed additional lower level glossary terms * merged master branch * added missinge linenos * changed indentation * merge with master * chagend tsv separator to t * use other suggestion action * Update code-blocks-linting.yaml --------- Co-authored-by: matteopilz --- .github/workflows/code-blocks-linting.yaml | 6 +- .gitignore | 1 + docs/source/_ext/chemrole.py | 15 ++ docs/source/algorithms.rst | 3 +- docs/source/background.rst | 246 ++++++++++-------- docs/source/build_from_source.rst | 8 +- docs/source/centroiding.rst | 20 +- ...g.rst => charge_isotope_deconvolution.rst} | 55 ++-- docs/source/chemistry.rst | 119 ++++----- docs/source/chromatographic_analysis.rst | 30 +-- docs/source/conf.py | 16 +- docs/source/digestion.rst | 8 +- ...{GNPS_export.rst => export_files_GNPS.rst} | 14 +- ...ersion.rst => export_pandas_dataframe.rst} | 28 +- docs/source/faq.rst | 6 +- docs/source/feature_detection.rst | 40 +-- docs/source/feature_linking.rst | 18 +- docs/source/first_steps.rst | 53 ++-- ...r.rst => fragment_spectrum_generation.rst} | 59 +++-- docs/source/glossary.rst | 175 ++++++------- ...z.rst => identification_accurate_mass.rst} | 60 ++--- ...uctures_id.rst => identification_data.rst} | 20 +- docs/source/index.rst | 68 +++-- docs/source/interactive_plots.rst | 10 +- 
...orial.rst => interfacing_ml_libraries.rst} | 34 ++- docs/source/introduction.rst | 44 ++-- docs/source/map_alignment.rst | 12 +- docs/source/mass_decomposition.rst | 18 +- docs/source/memory_management.rst | 6 +- .../{datastructures_peak.rst => ms_data.rst} | 90 +++---- .../{mzMLFileFormat.rst => mzml_files.rst} | 24 +- ...sequences.rst => oligonucleotides_rna.rst} | 10 +- ...handling.rst => other_ms_data_formats.rst} | 16 +- docs/source/parameter_handling.rst | 2 + docs/source/peptide_search.rst | 44 ++-- ...{aasequences.rst => peptides_proteins.rst} | 44 ++-- docs/source/pyopenms_in_r.rst | 32 +-- .../{mzqc_export.rst => quality_control.rst} | 1 + ...ctures_quant.rst => quantitative_data.rst} | 30 +-- ...ssql.rst => query_msexperiment_massql.rst} | 24 +- ...e_handling.rst => reading_raw_ms_data.rst} | 36 +-- ...ore.rst => scoring_spectra_hyperscore.rst} | 20 +- docs/source/smoothing.rst | 4 +- ...umalignment.rst => spectrum_alignment.rst} | 18 +- ...ization.rst => spectrum_normalization.rst} | 12 +- docs/source/support.rst | 4 +- ...untargeted_metabolomics_preprocessing.rst} | 18 +- ...rst => wrapping_workflows_new_classes.rst} | 24 +- 48 files changed, 858 insertions(+), 787 deletions(-) create mode 100644 docs/source/_ext/chemrole.py rename docs/source/{deisotoping.rst => charge_isotope_deconvolution.rst} (71%) rename docs/source/{GNPS_export.rst => export_files_GNPS.rst} (77%) rename docs/source/{pandas_df_conversion.rst => export_pandas_dataframe.rst} (95%) rename docs/source/{theoreticalspectrumgenerator.rst => fragment_spectrum_generation.rst} (75%) rename docs/source/{id_by_mz.rst => identification_accurate_mass.rst} (86%) rename docs/source/{datastructures_id.rst => identification_data.rst} (93%) rename docs/source/{ML_tutorial.rst => interfacing_ml_libraries.rst} (92%) rename docs/source/{datastructures_peak.rst => ms_data.rst} (86%) rename docs/source/{mzMLFileFormat.rst => mzml_files.rst} (91%) rename docs/source/{nasequences.rst => 
oligonucleotides_rna.rst} (94%) rename docs/source/{other_file_handling.rst => other_ms_data_formats.rst} (93%) rename docs/source/{aasequences.rst => peptides_proteins.rst} (88%) rename docs/source/{mzqc_export.rst => quality_control.rst} (99%) rename docs/source/{datastructures_quant.rst => quantitative_data.rst} (85%) rename docs/source/{massql.rst => query_msexperiment_massql.rst} (80%) rename docs/source/{file_handling.rst => reading_raw_ms_data.rst} (89%) rename docs/source/{hyperscore.rst => scoring_spectra_hyperscore.rst} (81%) rename docs/source/{spectrumalignment.rst => spectrum_alignment.rst} (86%) rename docs/source/{normalization.rst => spectrum_normalization.rst} (68%) rename docs/source/{metabolomics_preprocessing.rst => untargeted_metabolomics_preprocessing.rst} (90%) rename docs/source/{wrap_classes.rst => wrapping_workflows_new_classes.rst} (96%) diff --git a/.github/workflows/code-blocks-linting.yaml b/.github/workflows/code-blocks-linting.yaml index f108e017d..afcb568f8 100644 --- a/.github/workflows/code-blocks-linting.yaml +++ b/.github/workflows/code-blocks-linting.yaml @@ -32,7 +32,11 @@ jobs: exit 1 fi - - name: Run action-suggester + - name: Run action-suggester (only works on lines that were changes in this PR) uses: reviewdog/action-suggester@v1 with: tool_name: blacken-docs + + - name: Display diff and fail if there was anything changed by blacken-docs + run: | + git diff --diff-algorithm histogram --exit-code diff --git a/.gitignore b/.gitignore index 89fafec68..b66b3385f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/source/_autosummary/ docs/source/_build/ docs/build/ +docs/source/_autosummary diff --git a/docs/source/_ext/chemrole.py b/docs/source/_ext/chemrole.py new file mode 100644 index 000000000..f3c84daaf --- /dev/null +++ b/docs/source/_ext/chemrole.py @@ -0,0 +1,15 @@ +from docutils.nodes import math + + +def chem_role(name, rawtext, text, lineno, inliner, options={}, content=[]): + latex = rf'\ce{{{text}}}' + 
node = math(rawtext, latex, **options) + return [node], [] + + +def setup(app): + """Install the plugin. + :param app: Sphinx application context. + """ + app.add_role('chem', chem_role) + return {'parallel_read_safe': True} diff --git a/docs/source/algorithms.rst b/docs/source/algorithms.rst index 93cf72dfc..98fabeb14 100644 --- a/docs/source/algorithms.rst +++ b/docs/source/algorithms.rst @@ -34,9 +34,10 @@ pattern are :py:class:`~.GaussFilter`, :py:class:`~.SavitzkyGolayFilter` as well :py:class:`~.ParentPeakMower`, :py:class:`~.Scaler`, :py:class:`~.SpectraMerger`, :py:class:`~.SqrtMower`, :py:class:`~.ThresholdMower`, :py:class:`~.WindowMower`. -Using the same example file as before, we can execute a GaussFilter on our test data as follows: +Using the same example file as before, we can execute a :py:class:`~.GaussFilter` on our test data as follows: .. code-block:: python + :linenos: from pyopenms import * from urllib.request import urlretrieve diff --git a/docs/source/background.rst b/docs/source/background.rst index 0b1ae5f9f..00f6f74f4 100644 --- a/docs/source/background.rst +++ b/docs/source/background.rst @@ -1,79 +1,79 @@ -Background -========== +Introduction +============ -Proteomics and metabolomics focus on complex interactions within biological systems; the former is centered on proteins while the latter is based on metabolites. To understand these interactions, we need to accurately identify the different biological components involved. -:term:`Liquid chromatography` (LC) and :term:`mass spectrometry` (MS) are the analytical techniques used to isolate and identify biological components in proteomics and metabolomics. :term:`LC-MS` data can be difficult to analyze manually given its amount and complexity. Therefore, we need specialized software that can analyze high-throughput LC-MS data quickly and accurately. 
+Proteomics and metabolomics focus on complex interactions within biological systems; the former is centered on proteins while the latter is based on metabolites. To understand these interactions, we need to accurately identify the different biological components involved. +:term:`Liquid chromatography` (:term:`LC`) and mass spectrometry (MS) are the analytical techniques used to isolate and identify biological components in proteomics and metabolomics. :term:`LC-MS` data can be difficult to analyze manually given its amount and complexity. Therefore, we need specialized software that can analyze high-throughput :term:`LC-MS` data quickly and accurately. Why use OpenMS --------------- -OpenMS is an open-source, C++ framework for analyzing large volumes of :term:`mass spectrometry` data. +============== +OpenMS is an open-source, C++ framework for analyzing large volumes of mass spectrometry data. It has been specially designed for analyzing high performance :term:`LC-MS` data but over recent times, has been extended to analyze data generated by other techniques. .. note:: - OpenMS in recent times has been expanded to support a wide variety of :term:`mass spectrometry` experiments. + OpenMS in recent times has been expanded to support a wide variety of mass spectrometry experiments. To design your analysis solution, `contact the OpenMS team `_ today. -To use OpenMS effectively, an understanding of chromatography and :term:`mass spectrometry` is required as many of the -algorithms are based on these techniques. This section provides a detailed explanation on LC and MS, and how they are -combined to identify and quantify substances. +To use OpenMS effectively, an understanding of :term:`liquid chromatography` (:term:`LC`) and +mass spectrometry (MS) is required as many of the algorithms are based on these techniques. This +section provides a detailed explanation on :term:`LC` and MS, and how they are combined to identify and +quantify substances. 
-:term:`Liquid chromatography (LC)` ------------------------------------------ +:term:`Liquid chromatography` (:term:`LC`) +================================================================= -Chromatography is a technique used by life scientists to separate molecules based on a specific physical or -chemical property. +:term:`Liquid chromatography` is a technique used by life scientists to separate molecules based +on a specific physical or chemical property. .. raw:: html

Video

- For more information on chromatography, + For more information on :term:`LC`, view this video.
-There are many types of chromatography, but this section focuses on LC as it is widely used in -proteomics and metabolomics. +There are many types of chromatography, but this section focuses on :term:`LC` as it is widely used in proteomics and +metabolomics. -LC separates molecules based on a specific physical or chemical property by mixing a sample containing the molecules of -interest (otherwise known as **analytes**) in a liquid solution. +:term:`LC` separates molecules based on a specific physical or chemical property by mixing a sample containing the +molecules of interest (otherwise known as **analytes**) in a liquid solution. -Key components of LC -````````````````````` -An LC setup is made up of the following components: +Key components of :term:`LC` +```````````````````````````` +A :term:`LC` setup is made up of the following components: -* **A liquid solution**, known as the **mobile phase**, containing the analytes. +* **A liquid solution**, known as the **mobile phase**, containing the analytes. * **A pump** which transports the liquid solution. * **A stationary phase** which is a solid, homogeneous substance. -* **A column** that contains the stationary phase. -* **A detector** that plots the time it takes for the analyte to escape the column (:term:`retention time`) against the - analyte's concentration. This plot is called a **chromatogram**. +* **A column** that contains the stationary phase. +* **A detector** that plots the time it takes for the analyte to escape the column (retention time) against the analyte's concentration. This plot is called a chromatogram. -Refer to the image below for a diagrammatic representation of an LC setup. +Refer to the image below for a diagrammatic representation of a :term:`LC` setup. .. image:: img/introduction/lc-components.png -How does LC work? -````````````````` +How does :term:`LC` work? 
+````````````````````````` The liquid solution containing the analytes is pumped through a column that is attached to the stationary phase. Analytes are separated based on how strongly they interact with each phase. Some analytes will interact strongly with the mobile phase while others will be strongly attracted to the stationary phase, depending on their physical or chemical properties. The stronger an analyte's attraction is to the mobile phase, the faster it will leave the column. -The time it takes for an analyte to escape from the column is called the analyte's :term:`retention time`. +The time it takes for an analyte to escape from the column is called the analyte's retention time (RT). As a result of their differing attractions to the mobile and stationary phases, different analytes will have different -retention times, which is how separation occurs. +RTs, which is how separation occurs. -The retention times for each analyte are recorded by a detector. The most common detector used is the mass spectrometer, -which we discuss later. However, other detection methods exist, such as: +The RTs for each analyte are recorded by a detector. The most common detector used is the +mass spectrometer, which we discuss later. However, other detection methods exist, such as: * Light absorption (photometric detector) * Fluorescence * Change in diffraction index -High performance liquid chromatography (HPLC) -````````````````````````````````````````````` +High Performance :term:`Liquid Chromatography` (HPLC) +```````````````````````````````````````````````````````````````````````````` HPLC is the most commonly used technique for separating proteins and metabolites. In HPLC, a high-pressured pump is used to transport a liquid (solvent) containing the molecules of interest through a thin capillary column. The stationary phase is ‘packed’ into the column. 
@@ -91,10 +91,10 @@ Several variations of HPLC exist such as: - Affinity chromatography - Size exclusion chromatography -Special case of HPLC: Reversed-phase (RP) chromatography +Special Case of HPLC: Reversed-Phase (RP) Chromatography ::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -RP chromatography is the most commony type of HPLC with biological samples. In reversed-phase liquid chromatography, +RP chromatography is the most common type of HPLC with biological samples. In reversed-phase :term:`liquid chromatography`, the solid phase is modified to become hydrophobic, when it is originally hydrophilic, hence the term ‘reversed-phase’. The liquid phase is a mixture of water and an organic solvent. The separation of molecules happens based on the following behavior: hydrophilic analytes have a high affinity to the mobile phase and escape the column quickly @@ -109,50 +109,50 @@ escape the column. view this video. -Mass spectrometry (MS) ----------------------- +Mass Spectrometry (MS) +========================================================= -Mass spectrometry is an analytical technique used to determine the abundance of molecules in a sample. +Mass spectrometry is an analytical technique used to determine the abundance of molecules in a sample. Key components of MS -````````````````````` +```````````````````````````` There are three key components in a mass spectrometer: -* An **ion source**, which generates :term:`ions ` from the incoming sample. All mass spectrometry techniques rely +* An **ion source**, which generates ions from the incoming sample. All MS techniques rely on ionized molecules to control their movement in an electric field. -* A **mass analyzer**, which separates the :term:`ions ` according to their mass-to-charge (m/z) ratio. - There are several types such as time of flight (TOF), orbitrap and quadrupole mass analyzers. +* A **mass analyzer**, which separates the ions according to their mass-to-charge (m/z) ratio. 
+ There are several types such as time of flight (:term:`TOF`), :term:`orbitrap` and :term:`quadrupole` mass analyzers. Depending on the mass analyzer, OpenMS offers calibration tools, so that highly accurate results can be achieved. -* A **detector**, which scans ions at a given time point producing a :term:`mass spectrum`, where the intensity is +* A **detector**, which scans ions at a given time point producing a mass spectrum, where the intensity is plotted against the m/z. Refer to the image below for a diagrammatic representation of the key components in MS. .. image:: img/introduction/mass-spectrometry-components.png -Ion source +Ion Source :::::::::: We want the analytes to move through the electrostatic and electromagnetic fields in the mass analyzer. -To achieve this objective, we need to convert them to :term:`ions ` by charging them. There are a number of +To achieve this objective, we need to convert them to ions by charging them. There are a number of ways to charge our analytes including: -* Electrospray Ionization (ESI) +* Electrospray Ionization (:term:`ESI`) * Matrix Assisted Laser Desorption/Ionization (MALDI) * Electron Impact Ionization (EI) -In proteomics and metabolomics, ESI and MALDI are used because they are soft ionization techniques. +In proteomics and metabolomics, :term:`ESI` and MALDI are used because they are soft ionization techniques. A soft ionization technique is one which charges analytes while keeping the molecules of interest largely intact, so that they can be characterized easily at a later stage. Hard ionization techniques such as EI shatter analytes in smaller fragments, making it difficult to characterize large molecules. -Given that OpenMS focuses on proteomic and metabolomic applications, we will describe ESI and MALDI in further detail. +Given that OpenMS focuses on proteomics and metabolomics applications, we will describe :term:`ESI` and MALDI in further detail. 
-Electrospray Ionization (ESI) -''''''''''''''''''''''''''''' +:term:`Electrospray Ionization` (:term:`ESI`) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' -ESI can be broken down into the following steps. +:term:`ESI` can be broken down into the following steps. 1. The sample is dissolved in a polar, volatile buffer. 2. The sample - dissolved in the buffer - is pumped through a thin, stainless steel capillary. @@ -160,7 +160,7 @@ ESI can be broken down into the following steps. 4. The aerosol is directed through regions of high vacuum until the droplets evaporate until only the charged molecules are left. 5. The particles are fed to the mass analyzer. -Refer to the image below for a diagrammatic representation of the steps in ESI. +Refer to the image below for a diagrammatic representation of the steps in :term:`ESI`. .. image:: img/introduction/electrospray-ionization.png @@ -199,18 +199,18 @@ Once the analytes have been charged by the ion source, we want to now sort the a A number of mass analyzers exists. These include: -* Quadrupole analyzer -* Time-of-Flight analyzer -* Orbitrap analyzer +* :term:`Quadrupole` analyzer +* :term:`Time-of-flight` analyzer +* :term:`Orbitrap` analyzer The next sections describe each analyzer type in detail. -Quadrupole -'''''''''' +:term:`Quadrupole` +'''''''''''''''''''''''''''''' -In a quadropole analyzer, you can set the quadropole voltage so that ions with a specific m/z ratio travel through. The oscillating electrostatic fields stabilize the flight path for the ions so that they can pass through the quadropole. Other ions will be accelerated out of the quadropole and will not make it to the end. +In a :term:`quadrupole` analyzer, you can set the :term:`quadrupole` voltage so that ions with a specific m/z ratio travel through. The oscillating electrostatic fields stabilize the flight path for the ions so that they can pass through the :term:`quadrupole`. 
Other ions will be accelerated out of the :term:`quadrupole` and will not make it to the end. -Refer to the image below for a diagrammatic representation of the quadrupole analyzer. +Refer to the image below for a diagrammatic representation of the :term:`quadrupole` analyzer. .. image:: img/introduction/quadrupole-analyzer.png @@ -218,15 +218,15 @@ Refer to the image below for a diagrammatic representation of the quadrupole ana

Video

- For more information on quadrupole analyzers, view this video. + For more information on :term:`quadrupole` analyzers, view this video.
-Time-of-Flight (TOF) -'''''''''''''''''''' +:term:`Time-of-Flight` (:term:`TOF`) +''''''''''''''''''''''''''''''''''''''''''''''''''''''''' -In a time-of-flight analyzer, ions are extracted from the ion source through an electrostatic field in pulses in a field-free drift zone. An electrostatic mirror called a reflectron reflects the ions back onto the next component of :term:`mass spectrometry`, the detector. The detector counts the particles and records the time of flight from extraction to the moment the particle hits the detector. +In a :term:`time-of-flight` analyzer, ions are extracted from the ion source through an electrostatic field in pulses in a field-free drift zone. An electrostatic mirror called a reflectron reflects the ions back onto the next component of mass spectrometry, the detector. The detector counts the particles and records the time of flight from extraction to the moment the particle hits the detector. -Refer to the image below for a diagrammatic representation of the TOF analyzer. +Refer to the image below for a diagrammatic representation of the :term:`TOF` analyzer. .. image:: img/introduction/TOF.png @@ -245,7 +245,7 @@ Lighter ions fly faster than heavier ions of the same charge and will arrive ear \begin{equation} E_p = E_k = \frac{1}{2}mv^2 \end{equation} -3. We know that for a given path,**s**, from extraction to the detector, the time of flight, **t** is equal to: +3. We know that for a given path,**s**, from extraction to the detector, the :term:`time of flight`, **t** is equal to: .. math:: @@ -266,10 +266,14 @@ Therefore,, **t**, for a given instrument's path length, **s**, depends on an io For more information on TOF analyzers, view this video. -Orbitrap -'''''''' +:term:`Orbitrap` +'''''''''''''''''''''''''' -The orbitrap analyzer is the most frequently used analyzer in :term:`mass spectrometry` for proteomic and metabolomic applications. It consists of two outer electrodes and a central electrode. 
Ions are captured inside the analyzer because of an applied electrostatic field. The ions in the orbitrap analyzer oscillate around the central electrode along the axis of the electrostatic field at a set frequency, ω. This frequency is used to determine the mass-to-charge ratio using the following formula: +The :term:`orbitrap` analyzer is the most frequently used analyzer in mass spectrometry for +proteomics and metabolomics applications. It consists of two outer electrodes and a central electrode. +Ions are captured inside the analyzer because of an applied electrostatic field. The ions in the :term:`orbitrap` +analyzer oscillate around the central electrode along the axis of the electrostatic field at a set frequency, ω. +This frequency is used to determine the mass-to-charge ratio using the following formula: .. math:: @@ -285,15 +289,18 @@ The orbitrap analyzer is the most frequently used analyzer in :term:`mass spectr For more information on orbitrap analyzers, view this video. -The following diagram is a conceptual representation of an orbitrap mass analyzer. +The following diagram is a conceptual representation of an :term:`orbitrap` mass analyzer. .. image:: img/introduction/orbitrap.png -Identifying molecules with Tandem Mass Spectrometry (MS2) -````````````````````````````````````````````````````````` -To get better results, we can use two mass analyzers sequentially to generate and analyze ions. This technique is called **tandem :term:`mass spectrometry`** or MS/MS (MS2). Tandem :term:`mass spectrometry` is especially useful for linear polymers like proteins, RNA and DNA. +Identifying Molecules with :term:`tandem mass spectrometry` (:term:`MS2`) +``````````````````````````````````````````````````````````````````````````````````````````````````` +To get better results, we can use two mass analyzers sequentially to generate and analyze ions. +This technique is called :term:`tandem mass spectrometry` :term:`MS2`. 
:term:`Tandem mass spectrometry` is +especially useful for linear polymers like proteins, RNA and DNA. -With MS2, ions called **precursor ions** are isolated and fragmented into ion fragments or **product ions**. A :term:`mass spectrum` is recorded for both the precursor and the product ions. +With :term:`MS2`, ions called **precursor ions** are isolated and fragmented into ion fragments or **product ions**. +A mass spectrum is recorded for both the precursor and the product ions. .. raw:: html @@ -304,18 +311,23 @@ With MS2, ions called **precursor ions** are isolated and fragmented into ion fr Different fragmentation techniques to fragment peptides exist: -- Collision-Induced Dissociation (CID) +- :term:`Collision-induced dissociation` (:term:`CID`) - Pulsed Q Dissociation (PQD) - Electron transfer dissociation (ETD) - Electron capture dissociation (ECD) - Higher energy collision dissociation (HCD) -CID is the most frequently used fragmentation technique and will therefore be discussed in more detail in the following section. +:term:`CID` is the most frequently used fragmentation technique and will therefore be discussed in more detail in the following section. -Collision-induced dissociation -:::::::::::::::::::::::::::::: +:term:`Collision-Induced Dissociation` +:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -Collision-induced dissociation is a method to fragment peptides using an inert gas such as argon or helium. Selected primary or precursor ions enter a collision cell filled with the inert gas. The application of the inert gas on the precursor ions causes the precursor ions that reach the energy threshold to fragment into smaller, product ions and or neutral losses. A :term:`mass spectrum` is recorded for both the precursor ions and the product ions. The :term:`mass spectrum` for the precursor ions will give you the mass for the entire peptide while the product ions will inform you about it’s amino acid composition. 
+:term:`Collision-induced dissociation` is a method to fragment peptides using an +inert gas such as argon or helium. Selected primary or precursor ions enter a collision cell filled with the inert gas. +The application of the inert gas on the precursor ions causes the precursor ions that reach the energy threshold to +fragment into smaller, product ions and/or neutral losses. A mass spectrum is recorded for both the precursor +ions and the product ions. The mass spectrum for the precursor ions will give you the mass for the entire +peptide while the product ions will inform you about its amino acid composition. .. raw:: html @@ -324,18 +336,31 @@ Collision-induced dissociation is a method to fragment peptides using an inert g For more information on CID, view this video. + :term:`LC-MS` -------------- +::::::::::::: -Liquid chromatography is often coupled with :term:`mass spectrometry` to reduce complexity in the mass spectra. If complex samples were directly fed to a mass spectrometer, you would not be able to detect the less abundant analyte ions. The separated analytes from the liquid chromatography setup are directly injected into the ion source from the :term:`mass spectrometry` setup. Multiple analytes that escape the column at the same time are separated by their mass-to-charge ratio using the mass spectrometer. +Liquid chromatography is often coupled with mass spectrometry to reduce complexity in the +mass spectra. If complex samples were directly fed to a mass spectrometer, +you would not be able to detect the less abundant analyte ions. +The separated analytes from the :term:`liquid chromatography` setup are directly injected into the ion source from +the mass spectrometry setup. Multiple analytes that escape the column at the same time +are separated by their mass-to-charge ratio using the mass spectrometer. Refer to the image below for a diagrammatic representation of the :term:`LC-MS` setup. .. 
image:: img/introduction/lc-ms-setup.png -From the :term:`LC-MS` setup, a set of spectra called a peak map is produced. In a peak map, each spectrum represents the ions detected at a particular retention time. Each peak in a spectrum has a retention time, mass-to-charge and intensity dimension. +From the :term:`LC-MS` setup, a set of spectra called a peak map is produced. In a peak map, +each spectrum represents the ions detected at a particular retention time. +Each peak in a spectrum has a retention time, mass-to-charge and intensity dimension. -From the :term:`LC-MS` setup, a series of spectra are 'stacked' together to form what is known as a peak map. Each spectrum in a peak map is a collection of data points called :term:`peaks ` which indicate the retention time, mass-to-charge and intensity of each detected ion. Analyzing peak maps is difficult as different compounds can elute at the same time which means that peaks can overlap. Therefore, sophisticated techniques are required for the accurate identification and quantification of molecules. +From the :term:`LC-MS` setup, a series of spectra are 'stacked' together to form what is known as a peak map. +Each spectrum in a peak map is a collection of data points called peaks which indicate the +retention time, mass-to-charge and intensity of each detected ion. +Analyzing peak maps is difficult as different compounds can elute at the same time which means that +peaks can overlap. Therefore, sophisticated techniques are required for the accurate identification +and quantification of molecules. The image below includes a spectrum at a given retention time (left) and a peak map (right). @@ -348,29 +373,39 @@ The image below includes a spectrum at a given retention time (left) and a peak For more information on a *specific* application of LC-MS, view this video. 
-Improving identification and quantification -------------------------------------------- -While the combination of liquid chromatography and :term:`mass spectrometry` can ease the process of characterising molecules of interest, further techniques are required to easily identify and quantify these molecules. This section discusses both labeled and label-free quantification techniques. +Improving Identification and Quantification +=========================================== + +While the combination of :term:`liquid chromatography` and mass spectrometry can ease the process of +characterising molecules of interest, further techniques are required to easily identify and quantify these molecules. +This section discusses both labeled and label-free quantification techniques. Labeling ```````` -Relative quantification is one strategy where one sample is chemically treated and compared to another sample without treatment. This section discusses a particular relative quanitification technique called **labeling** or **stable isotope labeling** which involves the addition of isotopes to one sample. An isotope of an element behaves the same chemically but has a different mass. Stable isotope labeling is used in :term:`mass spectrometry` so that scientists can easily identify proteins and metabolites. +Relative quantification is one strategy where one sample is chemically treated and compared to another sample +without treatment. This section discusses a particular relative quantification technique called **labeling** or +**stable isotope labeling** which involves the addition of isotopes to one sample. An isotope of an element behaves +the same chemically but has a different mass. Stable isotope labeling is used in mass spectrometry so that +scientists can easily identify proteins and metabolites. Two types of stable isotope labeling exist: chemical labeling and metabolic labeling. 
Chemical labeling ::::::::::::::::: -During chemical labeling, the label is attached at specific functional groups in a molecule like the N-terminus of a peptide or specific side chains. +During chemical labeling, the label is attached at specific functional groups in a molecule like the N-terminus of a +peptide or specific side chains. -Chemical labeling occurs late in the process, therefore experiments that incorporate this technique are not highly reproducible. +Chemical labeling occurs late in the process, therefore experiments that incorporate this technique are not highly +reproducible. Isobaric labeling ''''''''''''''''' -Isobaric labeling, is a technique where peptides and proteins are labeled with chemical groups that have an identical mass, but vary in terms of of distribution of heavy isotopes in their structure. +Isobaric labeling, is a technique where peptides and proteins are labeled with chemical groups that have an identical +mass, but vary in terms of of distribution of heavy isotopes in their structure. .. raw:: html @@ -390,14 +425,17 @@ OpenMS contains tools that analyze data from isobaric labeling experiments. Metabolic labeling :::::::::::::::::: -During metabolic labeling, the organism is 'fed' with labeled metabolites. Metabolites include but are not limited to amino acids, nitrogen sources and glucose. Unlike chemical labeling, metabolic labeling occurs early in the study. Therefore, experiments that incorporate metabolic labeling are highly reproducible. +During metabolic labeling, the organism is 'fed' with labeled metabolites. Metabolites include but are not limited to +amino acids, nitrogen sources and glucose. Unlike chemical labeling, metabolic labeling occurs early in the study. +Therefore, experiments that incorporate metabolic labeling are highly reproducible. 
-Stable Isotope Labeling with Amino Aids in Cell Culture (SILAC) -''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' +Stable Isotope Labeling with Amino Acids in Cell Culture (:term:`SILAC`) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' -In SILAC, the labeled amino acids are fed to the cell culture. The labels are integrated into the proteins after a period. The labeled sample is then compared with the unlabeled sample. +In :term:`SILAC`, the labeled amino acids are fed to the cell culture. The labels are integrated into the proteins after +a period. The labeled sample is then compared with the unlabeled sample. -OpenMS contains tools that analyze data from SILAC experiments. +OpenMS contains tools that analyze data from :term:`SILAC` experiments. .. raw:: html @@ -412,17 +450,18 @@ OpenMS contains tools that analyze data from SILAC experiments. Label-free quantification (LFQ) ``````````````````````````````` -LFQ is a cheap and natural method of quantifying molecules of interest. As the name suggests, no labeling of molecules is involved. +LFQ is a cheap and natural method of quantifying molecules of interest. As the name suggests, no labeling of molecules +is involved. LFQ includes the following steps: 1. **Conduct replicate experiments**. -2. **Generate :term:`LC-MS` maps** for each experiment. -3. **Find features** in all :term:`LC-MS` maps. A :term:`feature` is a collection of peaks that belong to a chemical compound. +2. **Generate** :term:`LC-MS` **maps** for each experiment. +3. **Find features** in all :term:`LC-MS` maps. A feature is a collection of peaks that belong to a chemical compound. 4. **Align maps** to address shifts in retention times. 5. **Match corresponding features** in different maps. We refer to this as **grouping** or **linking**. -6. **Identify feature groups**, called :term:`consensus features `. -7. **Quantify consensus features**. +6.
**Identify feature groups**, called :term:`consensus features`. +7. **Quantify consensus features**. .. raw:: html @@ -432,10 +471,11 @@ LFQ includes the following steps: For more information on the steps involved in LFQ, view this video. -Feature finding +Feature Finding ::::::::::::::: -Feature finding is method for identifying all peaks belonging to a chemical compound. Feature finding involves the following steps: +Feature finding is a method for identifying all peaks belonging to a chemical compound. Feature finding +involves the following steps: 1. **Extension** where we collect all data points we think belong to the peptide. 2. **Refinement** where we remove peaks that we think do not belong to the peptide. diff --git a/docs/source/build_from_source.rst b/docs/source/build_from_source.rst index 4663ab563..d4cfa62f3 100644 --- a/docs/source/build_from_source.rst +++ b/docs/source/build_from_source.rst @@ -1,4 +1,4 @@ -Build from source +Build from Source ================== To install pyOpenMS from :index:`source`, you will first have to compile OpenMS @@ -12,7 +12,7 @@ following software packages On Microsoft Windows: you need the 64 bit C++ compiler from Visual Studio 2015 to compile the newest pyOpenMS for Python 3.5, 3.6 or 3.7. This is important, else you get a clib that is different than the one used for building the Python -executable, and pyOpenMS will crash on import. The OpenMS wiki has `detailed information +executable, and pyOpenMS will crash on import. The OpenMS wiki has `detailed information `_ on building pyOpenMS on Windows. @@ -43,7 +43,7 @@ that the correct Python executable is used. Compiling pyOpenMS can use a lot of memory and take some time, however you can reduce the memory consumption by breaking up the compilation into multiple units and compiling in parallel, for example ``-DPY_NUM_THREADS=2 -DPY_NUM_MODULES=4`` will build 4 modules with 2 -threads. You can then configure pyOpenMS: +threads. You can then configure pyOpenMS: ..
code-block:: bash @@ -59,7 +59,7 @@ Afterwards, test that all went well by running the tests: Which should execute all the tests and return with all tests passing. -Further questions +Further Questions ***************** In case the above instructions did not work, please refer to the `Wiki Page diff --git a/docs/source/centroiding.rst b/docs/source/centroiding.rst index 91a7b0b2b..0c4719ee5 100644 --- a/docs/source/centroiding.rst +++ b/docs/source/centroiding.rst @@ -2,15 +2,15 @@ Centroiding =========== MS instruments typically allow storing spectra in profile mode (several data points per m/z peak) -or in the more codensed centroid mode (one data point per m/z peak). The process of converting -a profile spectrum into a centroided one is called peak centroiding or peak picking. +or in the more condensed centroid mode (one data point per m/z peak). The process of converting +a profile mass spectrum into a centroided one is called peak centroiding or peak picking. -Note: The term peak picking is ambiguous as it is also used for feature detection (i.e. 3D "peak" finding). +Note: The term peak picking is ambiguous as it is also used for feature detection (i.e. 3D peak finding). First, we load some profile data: .. code-block:: python - + :linenos: from urllib.request import urlretrieve from pyopenms import * import matplotlib.pyplot as plt @@ -24,7 +24,7 @@ First, we load some profile data: Let's zoom in on an isotopic pattern in profile mode and plot it. .. code-block:: python - + :linenos: plt.xlim(771.8, 774) # zoom into isotopic pattern plt.plot( profile_spectra[0].get_peaks()[0], profile_spectra[0].get_peaks()[1] @@ -32,13 +32,13 @@ Let's zoom in on an isotopic pattern in profile mode and plot it. .. image:: img/profile_data.png -Because of the limited resolution of MS instruments m/z measurements are not of unlimited precision. -Consequently, peak shapes spreads in the m/z dimension and resemble a gaussian distribution.
-Using the PeakPickerHiRes algorithm, we can convert data from profile to centroided mode. Usually, not much information is lost +Because of the limited resolution of MS instruments m/z measurements are not of unlimited precision. +Consequently, peak shapes spreads in the m/z dimension and resemble a gaussian distribution. +Using the :py:class:`~.PeakPickerHiRes` algorithm, we can convert data from profile to centroided mode. Usually, not much information is lost by storing only centroided data. Thus, many algorithms and tools assume that centroided data is provided. .. code-block:: python - + :linenos: centroided_spectra = MSExperiment() # input, output, chec_spectrum_type (if set, checks spectrum type and throws an exception if a centroided spectrum is passed) @@ -52,6 +52,6 @@ by storing only centroided data. Thus, many algorithms and tools assume that cen ) # plot as vertical lines .. image:: img/centroided_data.png -After centroding, a single m/z value for every isotopic peak is retained. By plotting the centroided data as stem plot +After centroiding, a single m/z value for every isotopic peak is retained. By plotting the centroided data as stem plot we discover that (in addition to the isotopic peaks) some low intensity peaks (intensity at approx. 4k) were present in the profile data. diff --git a/docs/source/deisotoping.rst b/docs/source/charge_isotope_deconvolution.rst similarity index 71% rename from docs/source/deisotoping.rst rename to docs/source/charge_isotope_deconvolution.rst index b63365187..7e6ac7f5e 100644 --- a/docs/source/deisotoping.rst +++ b/docs/source/charge_isotope_deconvolution.rst @@ -3,27 +3,28 @@ Charge and Isotope Deconvolution A single mass spectrum contains measurements of one or more analytes and the m/z values recorded for these analytes. Most analytes produce multiple signals -in the mass spectrometer, due to the natural abundance of carbon 13 (naturally -occurring at ca. 
1% frequency) and the large amount of carbon atoms in most +in the mass spectrometer, due to the natural abundance of carbon :math:`13` (naturally +occurring at ca. :math:`1\%` frequency) and the large amount of carbon atoms in most organic molecules, most analytes produce a so-called isotopic pattern with a -monoisotopic peak (all carbon are C12) and a first isotopic peak (exactly one -carbon atom is a C13), a second isotopic peak (exactly two atoms are C13) etc. +monoisotopic peak (all carbon are :chem:`^{12}C`) and a first isotopic peak (exactly one +carbon atom is a :chem:`^{13}C`), a second isotopic peak (exactly two atoms are :chem:`^{13}C`) etc. Note that also other elements can contribute to the isotope pattern, see the -`Chemistry section `_ for further details. +`chemistry section `_ for further details. In addition, each analyte may appear in more than one charge state and adduct -state, a singly charge analyte ``[M+H]+`` may be accompanied by a doubly -charged analyte ``[M+2H]++`` or a sodium adduct ``[M+Na]+``. In the case of a +state, a singly charged analyte :chem:`[M +H]+` may be accompanied by a doubly +charged analyte :chem:`[M +2H]++` or a sodium adduct :chem:`[M +Na]+`. In the case of a multiply charged peptide, the isotopic traces are spaced by ``PROTON_MASS / -charge_state`` which is often close to 0.5 m/z for doubly charged analytes, -0.33 m/z for triply charged analytes etc. Note: tryptic peptides often appear +charge_state`` which is often close to :math:`0.5\ m/z` for doubly charged analytes, +:math:`0.33\ m/z` for triply charged analytes etc. Note: tryptic peptides often appear at least doubly charged, while small molecules often carry a single charge but can have adducts other than hydrogen. -Single peak example -******************* +Single Peak Example +********************************* ..
code-block:: python + :linenos: from pyopenms import * @@ -50,10 +51,11 @@ Note that the algorithm presented here as some heuristics built into it, such as assuming that the isotopic peaks will decrease after the first isotopic peak. This heuristic can be tuned by changing the parameter ``use_decreasing_model`` and ``start_intensity_check``. In this case, the -second isotopic peak is the highest in intensity and the +second isotopic peak is the highest in intensity and the ``start_intensity_check`` parameter needs to be set to 3. .. code-block:: python + :linenos: charge = 4 seq = AASequence.fromString("DFPIANGERDFPIANGERDFPIANGERDFPIANGER") @@ -93,7 +95,7 @@ second isotopic peak is the highest in intensity and the print(p.getMZ(), p.getIntensity()) -Full spectral de-isotoping +Full Spectral De-Isotoping ************************** In the following code segment, we will use a sample measurement of BSA (Bovine @@ -102,6 +104,7 @@ mass spectrum, which means grouping peaks of the same isotopic pattern charge state: .. code-block:: python + :linenos: from urllib.request import urlretrieve @@ -155,35 +158,35 @@ which produces the following output 974.4572680576728 6200571.5 974.4589691256419 3215808.75 -As we can see, the algorithm has reduced 140 peaks to 41 deisotoped peaks. It -also has identified a molecule at 974.45 m/z as the most intense peak in the -data (basepeak). +As we can see, the algorithm has reduced :math:`140` peaks to :math:`41` deisotoped peaks. It +also has identified a molecule at :math:`974.45\ m/z` as the most intense peak in the +data (base peak). 
Visualization ************* The reason we see two peaks very close together becomes apparent -once we look at the data in TOPPView which indicates that the 974.4572680576728 -peak is derived from a 2+ peak at m/z 487.73 and the peak at 974.4589691256419 -is derived from a 3+ peak at m/z 325.49: the algorithm has identified a single +once we look at the data in :term:`TOPPView` which indicates that the :math:`974.4572680576728` +peak is derived from a :chem:`2+` peak at m/z :math:`487.73` and the peak at :math:`974.4589691256419` +is derived from a :chem:`3+` peak at m/z :math:`325.49`: the algorithm has identified a single analyte in two charge states and deconvoluted the peaks to their nominal mass -of a ``[M+H]+`` ion, which produces two peaks very close together (2+ and 3+ +of a :chem:`[M +H]+` ion, which produces two peaks very close together (:chem:`2+` and :chem:`3+` peak): .. image:: img/deisotoped_zoom.png -Looking at the full spectrum and comparing it to the original spectrum, we can see the -original (centroided) spectrum on the top and the deisotoped spectrum on the -bottom in blue. Note how hovering over a peak in the deisotoped spectrum +Looking at the full mass spectrum and comparing it to the original mass spectrum, we can see the +original (centroided) mass spectrum on the top and the deisotoped mass spectrum on the +bottom in blue. Note how hovering over a peak in the deisotoped mass spectrum indicates the charge state: .. image:: img/deisotoped.png -In the next section, we will look at 2-dimensional deisotoping where instead of -a single spectrum, multiple spectra from a :term:`LC-MS` experiments are analyzed +In the next section (`Feature Detection `_), we will look at 2-dimensional deisotoping where instead of +a single mass spectrum, multiple mass spectra from a :term:`LC-MS` experiment are analyzed together. 
There algorithms analyze the full 2-dimensional (m/z and RT) signal and are generally more powerful than the 1-dimensional algorithm discussed here. However, not all data is 2 dimensional and the algorithm discussed here has many application in practice (e.g. single mass spectra, fragment ion -spectra in DDA etc.). +mass spectra in DDA etc.). diff --git a/docs/source/chemistry.rst b/docs/source/chemistry.rst index 42d409f7d..da8a4645c 100644 --- a/docs/source/chemistry.rst +++ b/docs/source/chemistry.rst @@ -25,7 +25,7 @@ Elements -------- In OpenMS, elements are stored in :py:class:`~.ElementDB` which has entries for dozens of -elements commonly used in :term:`mass spectrometry`. +elements commonly used in mass spectrometry. .. code-block:: python :linenos: @@ -54,15 +54,15 @@ elements commonly used in :term:`mass spectrometry`. print("One mole of 16O2 weighs", 2 * oxygen.getMonoWeight(), "grams") As we can see, the OpenMS :py:class:`~.ElementDB` has entries for common elements like -Oxygen and Sulfur as well as information on their average and monoisotopic +oxygen and sulfur as well as information on their average and monoisotopic weight. Note that the monoisotopic weight is the weight of the most abundant isotope while the average weight is the sum across all isotopes, weighted by -their natural abundance. Therefore, one mole of oxygen (O2) weighs slightly +their natural abundance. Therefore, one mole of oxygen (:chem:`O2`) weighs slightly more than a mole of only its monoisotopic isotope since natural oxygen is a mixture of multiple isotopes. .. code-block:: output - + Oxygen O 15.994915 @@ -80,6 +80,7 @@ Isotopes We can also inspect the full isotopic distribution of oxygen and sulfur: .. 
code-block:: python + :linenos: edb = ElementDB() oxygen_isoDist = {"mass": [], "abundance": []} @@ -112,21 +113,21 @@ We can also inspect the full isotopic distribution of oxygen and sulfur: sulfur_isoDist["abundance"].append((iso.getIntensity() * 100)) OpenMS can compute isotopic distributions for individual elements which contain -information for all stable elements. The current values in the file are +information for all stable elements. The current values in the file are average abundances found in nature, which may differ depending on location. The above code outputs the isotopes of oxygen and sulfur as well as their abundance: .. code-block:: output - Oxygen isotope 15.994915 has abundance 99.75699782371521 % - Oxygen isotope 16.999132 has abundance 0.03800000122282654 % - Oxygen isotope 17.999169 has abundance 0.20500000100582838 % + Oxygen isotope 15.994915 has abundance 99.75699782371521 % + Oxygen isotope 16.999132 has abundance 0.03800000122282654 % + Oxygen isotope 17.999169 has abundance 0.20500000100582838 % - Sulfur isotope 31.97207073 has abundance 94.92999911308289 % - Sulfur isotope 32.971458 has abundance 0.7600000128149986 % - Sulfur isotope 33.967867 has abundance 4.2899999767541885 % - Sulfur isotope 35.967081 has abundance 0.019999999494757503 % + Sulfur isotope 31.97207073 has abundance 94.92999911308289 % + Sulfur isotope 32.971458 has abundance 0.7600000128149986 % + Sulfur isotope 33.967867 has abundance 4.2899999767541885 % + Sulfur isotope 35.967081 has abundance 0.019999999494757503 % The isotope distribution of oxygen and sulfur can be displayed with the following extra code: @@ -224,8 +225,8 @@ Mass Defect nucleus, this leads to different observed masses due to the `mass defect `_, which describes the difference between the mass of an atom and the mass of - its constituent particles. 
For example, the mass difference between 12C and - 13C is slightly different than the mass difference between 14N and 15N, even + its constituent particles. For example, the mass difference between :chem:`^{12}C` and + :chem:`^{13}C` is slightly different than the mass difference between :chem:`^{14}N` and :chem:`^{15}N`, even though both only differ by a neutron from their monoisotopic element: .. code-block:: python @@ -252,12 +253,12 @@ Mass Defect Mass difference between 14N and 15N: 0.997035 Relative deviation: 0.6298867300208343 % - This difference can actually be measured by a high resolution mass - spectrometric instrument and is used in the `tandem mass tag (TMT) - `_ labelling strategy. +This difference can actually be measured by a high resolution mass spectrometry +instrument and is used in the `tandem mass tag (TMT) `_ +labelling strategy. - For the same reason, the helium atom has a slightly lower mass than the mass - of its constituent particles (two protons, two neutrons and two electrons): +For the same reason, the helium atom has a slightly lower mass than the mass +of its constituent particles (two protons, two neutrons and two electrons): .. code-block:: python @@ -283,15 +284,15 @@ Mass Defect Difference between the two masses: 0.7532065888743016 % The difference in mass is the energy released when the atom was formed (or - in other words, it is the energy required to dissassemble the nucleus into + in other words, it is the energy required to disassemble the nucleus into its particles). -Molecular Formulae +Molecular Formulas ------------------ Elements can be combined to molecular formulas (:py:class:`~.EmpiricalFormula`) which can be used to describe molecules such as metabolites, amino acid sequences or -oligonucleotides. The class supports a large number of operations like +oligonucleotides. The class supports a large number of operations like addition and subtraction. A simple example is given in the next few lines of code. 
@@ -316,15 +317,15 @@ which produces Note how in line 5 we were able to make a new molecule by adding existing molecules (for example by adding two :py:class:`~.EmpiricalFormula` objects). In this -case, we illustrated how to make ethanol by adding a ``CH2`` methyl group to an -existing methanol molecule. Note that OpenMS describes sum formulae with the +case, we illustrated how to make ethanol by adding a :chem:`CH2` methyl group to an +existing methanol molecule. Note that OpenMS describes sum formulas with the :py:class:`~.EmpiricalFormula` object and does store structural information in this class. Isotopes ~~~~~~~~ Specific isotopes can be incorporated into a molecular formula using bracket -notation. For example, ethanol with one or two C13 can be specified using ``(13)C`` as follows: +notation. For example, ethanol with one or two :chem:`C13` can be specified using :chem:`(13)C` as follows: .. code-block:: python :linenos: @@ -365,8 +366,8 @@ Isotopic Distributions OpenMS can also generate theoretical isotopic distributions from analytes represented as :py:class:`~.EmpiricalFormula`. Currently there are two algorithms -implemented, CoarseIsotopePatternGenerator which produces unit mass isotope -patterns and FineIsotopePatternGenerator which is based on the IsoSpec +implemented, :py:class:`~.CoarseIsotopePatternGenerator` which produces unit mass isotope +patterns and :py:class:`~.FineIsotopePatternGenerator` which is based on the IsoSpec algorithm [1]_ : .. code-block:: python @@ -417,7 +418,7 @@ which produces Isotope 47.0481419395 has abundance 0.06732848123647273 % Isotope 48.046119191399995 has abundance 0.20049810409545898 % -Together with the plotDistribution() function from above and the extra code: +Together with the ``plotDistribution()`` function from above and the extra code: .. 
code-block:: python :linenos: @@ -453,16 +454,16 @@ Please refer to our previous discussion on the `mass defect <#Mass-Defect>`__ to results of the hyperfine algorithm and why different elements produce slightly different masses. In this example, the hyperfine isotopic distribution will -contain two peaks for the nominal mass of 47: one at ``47.045`` for the -incorporation of one heavy 13C with a delta mass of ``1.003355`` and one at ``47.048`` -for the incorporation of one heavy deuterium with a delta mass of ``1.006277``. +contain two peaks for the nominal mass of :math:`47`: one at :math:`47.045` for the +incorporation of one heavy :math:`^{13}C` with a delta mass of :math:`1.003355` and one at :math:`47.048` +for the incorporation of one heavy deuterium with a delta mass of :math:`1.006277`. These two peaks also have two different abundances (the heavy carbon one has -2.1% abundance and the deuterium one has 0.07% abundance). This can be understood given that -there are 2 carbon atoms and the natural abundance of 13C is about -1.1%, while the molecule has six hydrogen atoms and the natural abundance of -deuterium is about 0.02%. The fine isotopic generator will not generate the -peak at nominal mass 49 since we specified our cutoff at 0.1% total abundance -and the four peaks above cover 99.9% of the +:math:`2.1\%` abundance and the deuterium one has :math:`0.07\%` abundance). This can be understood given that +there are 2 :chem:`C` atoms and the natural abundance of :chem:`^{13}C` is about +:math:`1.1\%`, while the molecule has six :chem:`H` atoms and the natural abundance of +deuterium is about :math:`0.02\%`. The fine isotopic generator will not generate the +peak at nominal mass :math:`49` since we specified our cutoff at :math:`0.1\%` total abundance +and the four peaks above cover :math:`99.9\%` of the isotopic abundance. We can also decrease our cutoff and ask for more isotopes to be calculated: @@ -485,29 +486,29 @@ which produces ..
code-block:: output - Fine Isotope Distribution: - This covers 0.9999993089130612 probability - Isotope 46.0418651914 has abundance 97.5662887096405 % - Isotope 47.0452201914 has abundance 2.110501006245613 % - Isotope 47.046082191400004 has abundance 0.03716550418175757 % - Isotope 47.0481419395 has abundance 0.06732848123647273 % - Isotope 48.046119191399995 has abundance 0.20049810409545898 % - Isotope 48.0485751914 has abundance 0.011413302854634821 % - Isotope 48.0494371914 has abundance 0.0008039440217544325 % - Isotope 48.0514969395 has abundance 0.0014564131561201066 % - Isotope 49.049474191399995 has abundance 0.004337066275184043 % - Isotope 49.0523959395 has abundance 0.00013835959862262825 % + Fine Isotope Distribution: + This covers 0.9999993089130612 probability + Isotope 46.0418651914 has abundance 97.5662887096405 % + Isotope 47.0452201914 has abundance 2.110501006245613 % + Isotope 47.046082191400004 has abundance 0.03716550418175757 % + Isotope 47.0481419395 has abundance 0.06732848123647273 % + Isotope 48.046119191399995 has abundance 0.20049810409545898 % + Isotope 48.0485751914 has abundance 0.011413302854634821 % + Isotope 48.0494371914 has abundance 0.0008039440217544325 % + Isotope 48.0514969395 has abundance 0.0014564131561201066 % + Isotope 49.049474191399995 has abundance 0.004337066275184043 % + Isotope 49.0523959395 has abundance 0.00013835959862262825 % Here we can observe more peaks and now also see the heavy oxygen peak at -``47.04608`` with a delta mass of ``1.004217`` (difference between 16O and 17O) at an -abundance of 0.04%, which is what we would expect for a single oxygen atom. 
-Even though the natural abundance of deuterium (0.02%) is lower than 17O -(0.04%), since there are six hydrogen atoms in the molecule and only one -oxygen, it is more likely that we will see a deuterium peak than a heavy oxygen +:math:`47.04608` with a delta mass of :math:`1.004217` (difference between :math:`^{16}O` and :math:`^{17}O`) at an +abundance of :math:`0.04\%`, which is what we would expect for a single :chem:`O` atom. +Even though the natural abundance of deuterium (:math:`0.02\%`) is lower than :math:`^{17}O` +(:math:`0.04\%`), since there are six :chem:`H` atoms in the molecule and only one +:chem:`O`, it is more likely that we will see a deuterium peak than a heavy oxygen peak. Also, even for a small molecule like ethanol, the differences in mass -between the hyperfine peaks can reach more than 110 ppm (48.046 vs 48.051). -Note that the FineIsotopePatternGenerator will generate peaks until the total -error has decreased to 1e-6, allowing us to cover 0.999999 of the probability. +between the hyperfine peaks can reach more than :math:`110` ppm (:math:`48.046` vs :math:`48.051`). +Note that the :py:class:`~.FineIsotopePatternGenerator` will generate peaks until the total +error has decreased to :math:`10^{-6}`, allowing us to cover :math:`0.999999` of the probability. OpenMS can also produce isotopic distribution with masses rounded to the nearest integer: @@ -625,7 +626,7 @@ Ribonucleotides A `ribonucleotide `_ describes one of the building blocks of DNA and RNA. In OpenMS, a ribonucleotide in its modified or unmodified form is represented by the :py:class:`~.Ribonucleotide` class in -OpenMS. The class is able to provide information such as the isotope +OpenMS. The class is able to provide information such as the isotope distribution of the residue, the average and monoisotopic weight. The residues can be identified by their full name, their three letter abbreviation or the single letter abbreviation.
Modified ribonucleotides are represented by the @@ -655,7 +656,7 @@ same class. Currently, support for RNA is implemented. False '1-methyladenosine' True - + .. We could also showcase the "get alternatives" method .. for alt in RibonucleotideDB().getRibonucleotideAlternatives(b"mmA?"): print(alt.getName()) diff --git a/docs/source/chromatographic_analysis.rst b/docs/source/chromatographic_analysis.rst index 8c33d8502..153e0d3e0 100644 --- a/docs/source/chromatographic_analysis.rst +++ b/docs/source/chromatographic_analysis.rst @@ -1,7 +1,7 @@ -Chromagraphic Analysis -====================== +Chromatographic Analysis +======================== -In targeted proteomics, such as SRM / MRM / PRM / DIA applications, groups of +In targeted proteomics, such as :term:`SRM` / MRM / PRM / DIA applications, groups of chromatograms need to be analyzed frequently. OpenMS provides several powerful tools for analysis of chromatograms. Most of them are part of the OpenSWATH suite of tools and are also discussed in the `OpenSwath documentation @@ -10,9 +10,9 @@ suite of tools and are also discussed in the `OpenSwath documentation Peak Detection ************** -Here, we will focus on a simple example where 2 peptides are analyzed. We will +Here, we will focus on a simple example where two peptides are analyzed. We will need 2 input files: the chromatogram files that contains the chromatographic -raw data (raw SRM traces or extracted ion chromatograms from PRM/DIA data) as +raw data (raw :term:`SRM` traces or extracted ion chromatograms from PRM/DIA data) as well as the library file used to generated the data which contains information about the targeted peptides: @@ -69,15 +69,15 @@ about the targeted peptides: ) Here we see that for the first group of transitions (``tr_gr1``), a single peak -at retention time 3119 seconds was found. However, for the second group of -transitions, two peaks are found at retention times 3119 seconds and at -3055 seconds. 
+at retention time :math:`3119\ seconds` was found. However, for the second group of +transitions, two peaks are found at retention times :math:`3119\ seconds` and at +:math:`3055\ seconds`. Visualization ************* We can confirm the above analysis by visual inspection of the ``chrom.mzML`` -file produced above in the TOPPView software: +file produced above in the :term:`TOPPView` software: .. image:: img/chroms.png @@ -92,14 +92,14 @@ However, our output above contains more information than only retention time: Feature for group tr_gr2 with precursor m/z 501.0 Feature found at RT = 3119.0630105310684 with library dot product 0.7501676755451506 -Based on the output above, we can infer that the peak at 3055 seconds is -likely the correct peak for ``tr_gr2`` since it has a high library dot product -(0.95) while the peak at 3119 seconds is likely incorrect for ``tr_gr2`` since -its dot product is low (0.75). We also see that a peak at 3119 seconds is +Based on the output above, we can infer that the peak at :math:`3055\ seconds` is +likely the correct peak for ``tr_gr2`` since it has a high library dot product +(:math:`0.95`) while the peak at :math:`3119\ seconds` is likely incorrect for ``tr_gr2`` since +its dot product is low (:math:`0.75`). We also see that a peak at :math:`3119\ seconds` is likely correct for ``tr_gr1`` since it matches well with the expected library -intensities and has a high dot product (:math:`0.99`). -Note: to get an overview over all available scores for a particular MRM feature ``f``, you can use +Note: to get an overview over all available scores for a particular MRM feature ``f``, you can use ..
code-block:: python diff --git a/docs/source/conf.py b/docs/source/conf.py index 396c58600..601ce290c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,13 +24,13 @@ import contextlib import sys +sys.path.append(os.path.abspath("./_ext")) + # import sys # sys.path.insert(0, os.path.abspath('.')) from platform import python_version_tuple from sys import platform -sys.path.append(os.path.abspath("./_ext")) - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -42,16 +42,16 @@ # ones. extensions = [ 'glossary_warnings', 'hoverxref.extension', - 'sphinx_copybutton', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', + 'sphinx_copybutton', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.mathjax', 'chemrole' ] autosummary_generate = True autosummary_imported_members = True autodoc_docstring_signature = True -#configure tooltips -hoverxref_roles = ['term',] -hoverxref_role_types = {'term':'tooltip',} -#specific for pyopenms documentation +# configure tooltips +hoverxref_roles = ['term', ] +hoverxref_role_types = {'term': 'tooltip', } +# specific for pyopenms documentation hoverxref_tooltip_lazy = True # Add any paths that contain templates here, relative to this directory. @@ -60,7 +60,7 @@ # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -# source_suffix = ['.rst', '.md'] +# source_suffix = ['.rst ', '.md'] source_suffix = '.rst' # The master toctree document. 
diff --git a/docs/source/digestion.rst b/docs/source/digestion.rst index 9c3eea1a8..392ce5aca 100644 --- a/docs/source/digestion.rst +++ b/docs/source/digestion.rst @@ -25,7 +25,7 @@ OpenMS has classes for proteolytic digestion which can be used as follows: len(result) # 82 peptides Very short peptides or even single amino acid digestion products are often discarded as they usually contain little information (e.g., can't be used to identify proteins). -We now only generate digestion products with a length of 7 to 40. +We now only generate digestion products with a length of :math:`7` to :math:`40`. .. code-block:: python @@ -52,7 +52,7 @@ Proteolytic Digestion with Lys-C ******************************** We can of course also use different enzymes, these are defined in the ``Enzymes.xml`` -file and can be accessed using the ``EnzymesDB`` object +file and can be accessed using the :py:class:`~.EnzymesDB` object .. code-block:: python @@ -83,14 +83,14 @@ cut our protein of interest: print(result[4].toString()) len(result) # 57 peptides -We now get different digested peptides (57 vs 82) and the fourth peptide is now +We now get different digested peptides (:math:`57` vs :math:`82`) and the fourth peptide is now ``GLVLIAFSQYLQQCPFDEHVK`` instead of ``DTHK`` as with Trypsin (see above). Oligonucleotide Digestion ************************** There are multiple cleavage enzymes available for oligonucleotides, these are defined ``Enzymes_RNA.xml`` -file and can be accessed using the ``RNaseDB`` object +file and can be accessed using the :py:class:`~.RNaseDB` object .. 
code-block:: python diff --git a/docs/source/GNPS_export.rst b/docs/source/export_files_GNPS.rst similarity index 77% rename from docs/source/GNPS_export.rst rename to docs/source/export_files_GNPS.rst index 4a9f365f2..c5c13f02f 100644 --- a/docs/source/GNPS_export.rst +++ b/docs/source/export_files_GNPS.rst @@ -1,16 +1,16 @@ -Export files for GNPS +Export Files for GNPS ===================== -With pyOpenMS you can automatically generate all files needed for GNPS Feature-Based Molecular Networking (FBMN) and +With pyOpenMS you can automatically generate all files needed for GNPS Feature-Based Molecular Networking (FBMN) and Ion Identity Molecular Networking (IIMN). -Pre-requisites are your input mzML files and a :py:class:`~.ConsensusMap`, generated by an +Pre-requisites are your input :term:`mzML` files and a :py:class:`~.ConsensusMap`, generated by an `untargeted metabolomics pre-processing workflow `_. -Ensure that MS2 data has been mapped to the :py:class:`~.FeatureMap` objects with :py:class:`~.IDMapper`. +Ensure that :term:`MS2` data has been mapped to the :py:class:`~.FeatureMap` objects with :py:class:`~.IDMapper`. For IIMN adduct detection must have been performed on the :py:class:`~.FeatureMap` objects during pre-processing with :py:class:`~.MetaboliteFeatureDeconvolution`. -First, download two example ``mzML`` files that have been map aligned based on a feature map alignment. +First, download two example :term:`mzML` files that have been map aligned based on a :term:`feature map` alignment. .. code-block:: python @@ -37,8 +37,8 @@ First, download two example ``mzML`` files that have been map aligned based on a consensusXML_file = "UntargetedMetabolomics.consensusXML" -Since GNPS only works with features that contain MS2 fragmentation spectra, the first step is to filter out features -from your :py:class:`~.ConsensusMap` that have no MS2 spectra annotated. 
+Since GNPS only works with features that contain :term:`MS2` fragmentation spectra, the first step is to filter out features +from your :py:class:`~.ConsensusMap` that have no :term:`MS2` spectra annotated. .. code-block:: python diff --git a/docs/source/pandas_df_conversion.rst b/docs/source/export_pandas_dataframe.rst similarity index 95% rename from docs/source/pandas_df_conversion.rst rename to docs/source/export_pandas_dataframe.rst index fbbfef614..9519f3e32 100644 --- a/docs/source/pandas_df_conversion.rst +++ b/docs/source/export_pandas_dataframe.rst @@ -19,11 +19,11 @@ Required imports for the examples: url = "https://raw.githubusercontent.com/OpenMS/pyopenms-docs/master/src/data/" -MSExperiment -************ +:py:class:`~.MSExperiment` +************************** **pyopenms.MSExperiment.get_df(** *long=False* **)** - Generates a pandas DataFrame with all peaks in the MSExperiment + Generates a pandas DataFrame with all peaks in the MSExperiment **Parameters:** @@ -72,11 +72,11 @@ MSExperiment "0", "1501.41394", "300.089752", "3431.026123" "1", "1501.41394", "300.181335", "1181.808960" -PeptideIdentifications -********************** +:py:class:`~.PeptideIdentification` +*********************************** **pyopenms.peptide_identifications_to_df( peps**, *decode_ontology=True*, *default_missing_values={bool: False, int: -9999, float: np.nan, str: ''}*, *export_unidentified=True* **)** - Generates a pandas DataFrame with all peaks in the MSExperiment + Generates a pandas DataFrame with all peaks in the MSExperiment **Parameters:** @@ -121,8 +121,8 @@ PeptideIdentifications "0", "OpenNuXL_2019-12-04T16:39:43_1021782429466859437", "900.425415", "414.730865", "0.368649", "4", "DECOY_sp|Q86UQ0|ZN589_HUMAN", "255", "267", "828.458069", "552.641113", "...", "0", "1654.901611", "0", "0.173912" "1", "OpenNuXL_2019-12-04T16:39:43_7293634134684008928", "903.565186", "506.259521", "0.422779", "2", "sp|P61313|RL15_HUMAN", "179", "187", "0.0", "0.0", "...", "0", 
"1010.504639", "0", "0.290786" -FeatureMap -********** +:py:class:`~.FeatureMap` +************************ **pyopenms.FeatureMap.get_df(** *meta_values = None* **)** Generates a pandas DataFrame with information contained in the FeatureMap. @@ -198,7 +198,7 @@ FeatureMap **Extract assigned peptide identifications from a feature map** -Peptide identifications can be mapped to their corresponding features in a ``FeatureMap``. It is possible to extract them using the function +Peptide identifications can be mapped to their corresponding features in a ``FeatureMap``. It is possible to extract them using the function ``pyopenms.FeatureMap.get_assigned_peptide_identifications()`` returning a list of ``PeptideIdentification`` objects. @@ -219,7 +219,7 @@ Peptide identifications can be mapped to their corresponding features in a ``Fea A ``DataFrame`` can be created on the resulting list of :py:class:`~.PeptideIdentification` objects using ``pyopenms.peptide_identifications_to_df(assigned_peptides)``. -Feature map and peptide data frames contain columns, on which they can be merged together to contain the complete +:term:`Feature map` and peptide data frames contain columns, on which they can be merged together to contain the complete information for peptides and features in a single data frame. The columns for unambiguously merging the data frames: @@ -233,6 +233,7 @@ The columns for unambiguously merging the data frames: **Example:** .. 
code-block:: python + :linenos: feature_df = feature_map.get_df() assigned_peptides = feature_map.get_assigned_peptide_identifications() @@ -252,8 +253,8 @@ The columns for unambiguously merging the data frames: "9650885788371886430", "LVTDLTK", "0.000000", "unknown", "spectrum=1270", "2", "1942.600083", "395.239277", "1932.484009", "1950.834351", "...", "OMSSA_2009-11-17T11:11:11_4731105163044641872", "1933.405151", "395.239349", "0.000000", "2", "P02769|ALBU_BOVIN", "-1", "-1", "0.001084", "True" "18416216708636999474", "DDSPDLPK", "0.034483", "unknown", "spectrum=1167", "2", "1749.138335", "443.711224", "1735.693115", "1763.343506", "...", "OMSSA_2009-11-17T11:11:11_4731105163044641872", "1738.033447", "443.711243", "0.034483", "2", "P02769|ALBU_BOVIN", "-1", "-1", "0.003951", "True" -ConsensusMap -************ +:py:class:`~.ConsensusMap` +************************** **pyopenms.ConsensusMap.get_df()** Generates a pandas DataFrame with both consensus feature meta data and intensities from each sample. @@ -262,7 +263,7 @@ ConsensusMap **pandas.DataFrame** - consensus map meta data and intensity stored in pandas DataFrame + :term:`consensus map` meta data and intensity stored in pandas DataFrame **pyopenms.ConsensusMap.get_intensity_df()** Generates a pandas DataFrame with feature intensities from each sample in long format (over files). @@ -308,6 +309,7 @@ ConsensusMap "10409195546240342212", "SHC(Carbamidomethyl)IAEVEK", "3", "1552.032973", "358.174576", "0.491247", "1358151.0", "...", "0.0" .. code-block:: python + :linenos: df = consensus_map.get_intensity_df() df.head(2) diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 298c1b692..b8028c509 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -4,18 +4,18 @@ Frequently Asked Questions .. contents:: The following is a list of common questions asked about pyOpenMS. How can I wrap a new method with pyOpenMS? 
------------------------------------------- +************************************************** Add an entry to ``src/pyOpenMS/pxds/CLASS_NAME.pxd`` with the signature of your new method(s). How can I wrap a new class with pyOpenMS? ------------------------------------------ +************************************************* Create a new file ``src/pyOpenMS/pxds/CLASS_NAME.pxd`` and use the `procedure outlined `_. Can I use multiple output parameters? -------------------------------------- +************************************* Python does not support passing primitive types (``int``, ``double``, etc.) by reference, therefore ``void calculate(double &)`` will not work. diff --git a/docs/source/feature_detection.rst b/docs/source/feature_detection.rst index e5f94f58c..f7cda16ec 100644 --- a/docs/source/feature_detection.rst +++ b/docs/source/feature_detection.rst @@ -1,16 +1,16 @@ Feature Detection ================= -One very common task in :term:`mass spectrometry` is the detection of 2-dimensional -patterns in m/z and time (RT) dimension from a series of MS1 scans. These +One very common task in mass spectrometry is the detection of 2-dimensional +patterns in m/z and time (RT) dimension from a series of :term:`MS1` scans. These patterns are called ``Features`` and they exhibit a chromatographic elution -profile in the time dimension and an isotopic pattern in the m/z dimension (see +profile in the time dimension and an isotopic pattern in the m/z dimension (see `previous section `_ for the 1-dimensional problem). OpenMS has multiple tools that can identify these features in 2-dimensional data, these tools are called :py:class:`~.FeatureFinder`. 
Currently the following FeatureFinders are available in pyOpenMS: - - :py:class:`~.FeatureFinderMultiplexAlgorithm` (e.g., SILAC, Dimethyl labeling, (and label-free), identification free feature detection of peptides) + - :py:class:`~.FeatureFinderMultiplexAlgorithm` (e.g., :term:`SILAC`, Dimethyl labeling, (and label-free), identification free feature detection of peptides) - :py:class:`~.FeatureFinderAlgorithmPicked` (Label-free, identification free feature detection of peptides) - :py:class:`~.FeatureFinderIdentificationAlgorithm` (Label-free identification-guided feature detection of peptides) - :py:class:`~.FeatureFinderAlgorithmIsotopeWavelet` (old instruments) @@ -20,10 +20,10 @@ FeatureFinders are available in pyOpenMS: All of the algorithms above are for proteomics data with the exception of :py:class:`~.FeatureFindingMetabo` and :py:class:`~.FeatureFinderMetaboIdentCompound` for metabolomics data and small molecules in general. Proteomics -********** +****************************** -Two of the most commonly used FeatureFinders for proteomics in OpenMS are the :py:class:`~.FeatureFinder` and :py:class:`~.FeatureFinderIdentificationAlgorithm` which both work on (high -resolution) centroided data. We can use the following code to find ``Features`` in MS data: +Two of the most commonly used feature finders for proteomics in OpenMS are the :py:class:`~.FeatureFinder` and :py:class:`~.FeatureFinderIdentificationAlgorithm` which both work on (high +resolution) centroided data. We can use the following code to find features in MS data: .. code-block:: python @@ -78,14 +78,14 @@ Python as follows: Each entry in the :py:class:`~.FeatureMap` is a so-called :py:class:`~.Feature` and allows direct -access to the `m/z` and `RT` value from Python. Again, we can lear this by -inspecting ``help(f)`` or by consulting the Manual. +access to the m/z and RT value from Python. Again, we can learn this by +inspecting ``help(f)`` or by consulting the manual. 
Note: the output file that we have written (``output.featureXML``) is an OpenMS-internal XML format for storing features. You can learn more about file formats in the `Reading MS data formats `_ section. -Metabolomics - untargeted +Metabolomics - Untargeted ************************* For the untargeted detection of small molecule features we can use the :py:class:`~.FeatureFindingMetabo` with prior :py:class:`~.MassTraceDetection` and :py:class:`~.ElutionPeakDetection`. @@ -145,21 +145,21 @@ For the untargeted detection of small molecule features we can use the :py:class fm.setUniqueIds() fm.setPrimaryMSRunPath(["ms_data.mzML".encode()]) -Metabolomics - targeted +Metabolomics - Targeted *********************** -``FeatureFinderAlgorithmMetaboIdent`` performs MS1-based **targeted feature extraction** based on user provided compounds, which are -specified in an assay library (a tab-separated text file). Detected ``Features`` are stored in a :py:class:`~.FeatureMap` which can be -stored in a :py:class:`~.FeatureXMLFile`. This tool is useful for the targeted extraction of ``Features`` for a well-defined set of compounds -with known sum formulas and retention times. +:py:class:`~.FeatureFinderAlgorithmMetaboIdent` performs :term:`MS1`-based **targeted feature extraction** based on user provided compounds, which are +specified in an assay library (a tab-separated text file). Detected features are stored in a :py:class:`~.FeatureMap` which can be +stored in a :py:class:`~.FeatureXMLFile`. This tool is useful for the targeted extraction of features for a well-defined set of compounds +with known sum formulas and retention times. For more information on the format of the assay library and available parameters visit the `FeatureFinderMetaboIdent documentation `_. The pyOpenMS :py:class:`~.FeatureFinderAlgorithmMetaboIdent` needs a list of :py:class:`~.FeatureFinderMetaboIdentCompound` objects as an assay libray for it's -``run`` function. 
We could create that list ourselves or use the following function to read an assay library as ``.tsv`` file: +:py:meth:`~.FeatureFinderAlgorithmMetaboIdent.run()` function. We could create that list ourselves or use the following function to read an assay library as ``.tsv`` file: -.. csv-table:: Coupounds tsv file +.. csv-table:: Compounds tsv file :widths: 50 30 15 15 15 15 15 :header: "CompoundName", "SumFormula", "Mass", "Charge", "RetentionTime", "RetentionTimeRange", "IsoDistribution" @@ -247,11 +247,11 @@ formats in the `Reading MS data formats `_ section. We can get a quick overview on the detected features by plotting them using the following function: .. code-block:: python + :linenos: - import matplotlib.pyplot as plt + import matplotlib.pyplot as plt - - def plotDetectedFeatures3D(path_to_featureXML): + def plotDetectedFeatures3D(path_to_featureXML): fm = FeatureMap() fh = FeatureXMLFile() fh.load(path_to_featureXML, fm) diff --git a/docs/source/feature_linking.rst b/docs/source/feature_linking.rst index f69033ed0..9993536d0 100644 --- a/docs/source/feature_linking.rst +++ b/docs/source/feature_linking.rst @@ -3,7 +3,7 @@ Feature Linking The pyOpenMS feature grouping algorithms group corresponding features (e.g., of same analyte) from multiple :py:class:`~.FeatureMap` objects into a :py:class:`~.ConsensusMap`. Linking is primarily done based on spatial proximity (e.g., similar retention time and m/z). -It is, thus, advisable to perform a map alignment before feature linking. +It is therefore advisable to perform a map alignment before feature linking. Optionally, identification data can be considered to prevent linking of features with different identifications.
@@ -12,13 +12,13 @@ Optionally, identification data can be considered to prevent linking of features Different feature grouping algorithms with slightly different implementations are runtime characteristics are available in pyOpenMS: -- FeatureGroupingAlgorithmQT -- FeatureGroupingAlgorithmKD -- FeatureGroupingAlgorithm -- FeatureGroupingAlgorithmLabeled -- FeatureGroupingAlgorithmUnlabeled +- :py:class:`~.FeatureGroupingAlgorithmQT` +- :py:class:`~.FeatureGroupingAlgorithmKD` +- :py:class:`~.FeatureGroupingAlgorithm` +- :py:class:`~.FeatureGroupingAlgorithmLabeled` +- :py:class:`~.FeatureGroupingAlgorithmUnlabeled` -We now perform a feature linking using the FeatureGroupingAlgorithmQT algorithm. +We now perform feature linking using the :py:class:`~.FeatureGroupingAlgorithmQT` algorithm. Download Example Data ********************* @@ -47,8 +47,8 @@ Download Example Data FeatureXMLFile().load(feature_file, feature_map) feature_maps.append(feature_map) -Feature Linking Algorithm -************************* +Feature Linking Algorithm +****************************************** All :py:class:`~.FeatureMap` objects will be combined in a :py:class:`~.ConsensusMap`. diff --git a/docs/source/first_steps.rst b/docs/source/first_steps.rst index a25d2556c..1870a1318 100644 --- a/docs/source/first_steps.rst +++ b/docs/source/first_steps.rst @@ -1,14 +1,14 @@ -Import pyopenms +Import pyOpenMS =============== -After installation, you should be able to import pyopenms as a package +After installation, you should be able to import pyOpenMS as a package .. code-block:: python import pyopenms -which should now give you access to all of pyopenms. You should now be able to -interact with the OpenMS library and, for example, read and write mzML files: +which should now give you access to all of pyOpenMS. You should now be able to +interact with the OpenMS library and, for example, read and write :term:`mzML` files: ..
code-block:: python @@ -17,13 +17,13 @@ interact with the OpenMS library and, for example, read and write mzML files: exp = MSExperiment() MzMLFile().store("testfile.mzML", exp) -which will create an empty mzML file called `testfile.mzML`. +which will create an empty :term:`mzML` file called `testfile.mzML`. -Getting help -============ +Using the Help Function +======================= There are multiple ways to get information about the available functions and -methods. We can inspect individual pyOpenMS objects through the ``help`` +methods. We can inspect individual pyOpenMS objects through the ``Python`` ``help`` function: .. code-block:: python @@ -65,7 +65,7 @@ section which points to additional classes that act as base classes to The list of available methods is long (but does *not* include methods from the base classes) and reveals that the class exposes methods such as :py:meth:`~.MSExperiment.getNrSpectra` and :py:meth:`~.MSExperiment.getSpectrum(id)` where the argument ``id`` indicates -the spectrum identifer. The command also lists the signature for each +the spectrum identifier. The command also lists the signature for each function, allowing users to identify the function arguments and return types. We can gain further information about exposed methods by investigating the documentation of the base classes: @@ -95,14 +95,14 @@ wrapped methods, please consult the official OpenMS documentation, in this case the `MSExperiment documentation `_. -First look at data +First Look at Data ================== -File reading +File Reading ************ pyOpenMS supports a variety of different files through the implementations in -OpenMS. In order to read mass spectrometric data, we can download the `mzML` +OpenMS. In order to read mass spectrometric data, we can download the :term:`mzML` example file: .. 
code-block:: python @@ -116,7 +116,7 @@ example file: # load example file MzMLFile().load("tiny.mzML", exp) -which will load the content of the "tiny.mzML" file into the ``exp`` +which will load the content of the ``tiny.mzML`` file into the ``exp`` variable of type :py:class:`~.MSExperiment`. We can now inspect the properties of this object: @@ -190,7 +190,7 @@ This iterates through all available :py:class:`~.MSSpectra`, we can also access MS Level: 2 Note that ``spec[1]`` will access the *second* spectrum (arrays start at -``0``). We can access the raw peaks through :py:meth:`~.MSSpectrum.get_peaks`: +``0``). We can access the raw peaks through :py:meth:`~.MSSpectrum.get_peaks()`: .. code-block:: python @@ -201,9 +201,9 @@ Note that ``spec[1]`` will access the *second* spectrum (arrays start at 110 -Which will access the data using a numpy array, storing the *m/z* information -in the ``mz`` vector and the intensity in the ``i`` vector. Alternatively, we -can also iterate over individual peak objects as follows (this tends to be +Which will access the data using a numpy array, storing the m/z information +in the ``mz`` vector and the intensity in the ``i`` vector. Alternatively, we +can also iterate over individual peak objects as follows (this tends to be slower): .. code-block:: python @@ -224,14 +224,14 @@ slower): 4.0 2.0 -Total ion current calculation +Total Ion Current Calculation ***************************** Here, we will apply what we have learned to calculate the total ion current (TIC). The TIC represents the summed intensity across the entire range of masses being detected at every point in the analysis. Basically, we calculate the total ion current of the whole experiment. -With this information, we can write a function that calculates the TIC for a given ms level: +With this information, we can write a function that calculates the TIC for a given MS level: ..
code-block:: python @@ -262,23 +262,24 @@ To calculate a TIC we would now call the function: Note how one can compute the same property using list comprehensions in Python (see line number 3 in the above code which computes the TIC using filtering properties of Python list comprehensions (``s.getMSLevel() == 1``) and computes -the sum over all peaks (right ``sum``) and the sum over all spectra (left +the sum over all peaks (right ``sum``) and the sum over all spectra (left ``sum``) to retrieve the TIC). -Total ion current chromatogram -****************************** +Total Ion Current Chromatogram +**************************************************** The total ion current is visualized over the retention time, to allow for the inspection of areas with general high intensity (usually multiple analytes were measured there). This can help the experimentalist to optimize the chromatography for a better -seperation in a specific area. +separation in a specific area. -While some mzML files already contain a pre-computed total ion current chromatogram (TIC), -we will show you how to calculate the TIC for MS1. One can access the retention times and -intensities of the TIC in different ways and generate a total ion current chromatogram +While some :term:`mzML` files already contain a pre-computed total ion current chromatogram (TIC), +we will show you how to calculate the TIC for :term:`MS1`. One can access the retention times +and intensities of the TIC in different ways and generate a total ion current chromatogram (2D graph) using ``matplotlib``: ..
code-block:: python + :linenos: import matplotlib.pyplot as plt from urllib.request import urlretrieve diff --git a/docs/source/theoreticalspectrumgenerator.rst b/docs/source/fragment_spectrum_generation.rst similarity index 75% rename from docs/source/theoreticalspectrumgenerator.rst rename to docs/source/fragment_spectrum_generation.rst index d7850bea4..c00cfa3ae 100644 --- a/docs/source/theoreticalspectrumgenerator.rst +++ b/docs/source/fragment_spectrum_generation.rst @@ -1,17 +1,18 @@ -Fragment spectrum generation +Fragment Spectrum Generation ============================ Generating theoretical fragment spectra is central to many identification tasks in computational mass spectrometry. -TheoreticalSpectrumGenerator can be configured to generate tandem MS spectra from +:py:class:`~.TheoreticalSpectrumGenerator` can be configured to generate :term:`MS2` spectra from a given peptide charge combination. There are various parameters which influence the generated ions e.g. simulating different fragmentation techniques. -Y-ion spectrum -************** +Y-ion :term:`Mass Spectrum` +****************************************** -First, we will generate a simple spectrum that only contains y-ions +First, we will generate a simple mass spectrum that only contains y-ions .. code-block:: python + :linenos: from pyopenms import * @@ -47,22 +48,27 @@ which produces all y single charged ions: which you could plot with :py:func:`~.plot_spectrum`, automatically showing annotated ions.: .. code-block:: python + :linenos: + import matplotlib.pyplot as plt from pyopenms.plotting import plot_spectrum import matplotlib.pyplot as plt plot_spectrum(spec1) plt.show() + + ..
image:: img/DFPIANGER_theo.png -Full fragment ion spectrum -************************** +Full Fragment Ion :term:`Mass Spectrum` +****************************************************** -We can also produce additional peaks in the fragment ion spectrum, such as +We can also produce additional peaks in the fragment ion mass spectrum, such as isotopic peaks, precursor peaks, ions from higher charge states, additional ion series, or common neutral losses: .. code-block:: python + :linenos: spec2 = MSSpectrum() # standard behavior is adding b- and y-ions @@ -111,20 +117,23 @@ peaks), here we will just show the first few peaks: b2-H2O1++ is generated at m/z 123.049673158171 [...] -which you again can visualize with: +which you can again visualize with: .. code-block:: python + :linenos: + import matplotlib.pyplot as plt from pyopenms.plotting import plot_spectrum import matplotlib.pyplot as plt plot_spectrum(spec2, annotate_ions=False) plt.show() + .. image:: img/DFPIANGER_theo_full.png The first example shows how to put peaks of a certain type, y-ions in this case, into -a spectrum. The second spectrum is filled with a complete fragment ion spectrum +a mass spectrum. The second mass spectrum is filled with a complete fragment ion mass spectrum of all peaks (a-, b-, y-ions, precursor peaks, and losses). Here, from the peptide with 9 amino acids, fragments theoretically can occur in 8 @@ -137,16 +146,14 @@ the largest charge states (precursor ion (M+H) and its loss of water ([M+H]-H2O) ammonia ([M+H]-NH3)). To include all precursor ions with possible charge states, the ``add_all_precursor_charges`` parameter should be set to true. 
-The losses are based on commonly -observed fragment ion losses for specific amino acids and are defined in the -``Residues.xml`` file, which means that not all fragment ions will produce all -possible losses, as can be observed above: water loss is not observed for the -y1 ion but for the y2 ion since glutamic acid can have a neutral water loss but -arginine cannot. Similarly, only water loss and no ammonia loss is simulated in -the ``a/b/c`` ion series with the first fragment capable of ammonia loss being +The losses are based on commonly observed fragment ion losses for specific +amino acids and are defined in the ``Residues.xml`` file, which means that not all +fragment ions will produce all possible losses, as can be observed above: water loss +is not observed for the y1 ion but for the y2 ion since glutamic acid can have a neutral +water loss but arginine cannot. Similarly, only water loss and no ammonia loss is simulated +in the ``a/b/c`` ion series with the first fragment capable of ammonia loss being asparagine at position 6. - The :py:class:`~.TheoreticalSpectrumGenerator` has many parameters which have a detailed description located in the class documentation. Note how the ``add_metainfo`` parameter @@ -156,22 +163,22 @@ iterate over annotated ions and their masses. Visualization ************* -We can now visualize the resulting spectra using TOPPView when we open the -DFPIANGER.mzML file that we produced above in TOPPView: +We can now visualize the resulting spectra using :term:`TOPPView` when we open the +DFPIANGER.mzML file that we produced above in :term:`TOPPView`: .. 
image:: img/peptide_y_ions.png We can see all eight y ion peaks that are produced in the -:py:class:`~.TheoreticalSpectrumGenerator` and when we hover over one of the peaks (546 mz in +:py:class:`~.TheoreticalSpectrumGenerator` and when we hover over one of the peaks (:math:`546` m/z in this example) there is an annotation in the bottom left corner that indicates -charge state and ion name (``y5+`` for every peak). The larger spectrum with -146 peaks can also be interactively investigated with TOPPView (the second +charge state and ion name (:chem:`y5+` for every peak). The larger spectrum with +:math:`146` peaks can also be interactively investigated with :term:`TOPPView` (the second spectrum in the file): .. image:: img/peptide_all_ions.png -There are substantially more peaks here and the spectrum is much busier, with -singly and double charged peaks of the b, y and a series creating 44 different +There are substantially more peaks here and the mass spectrum is much busier, with +singly and doubly charged peaks of the b, y and a series creating :math:`44` different individual fragment ion peaks as well as neutral losses adding an additional -102 peaks (neutral losses easily recognizable by their 10-fold lower intensity +:math:`102` peaks (neutral losses easily recognizable by their :math:`10`-fold lower intensity in the simulated spectrum). diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index 0ebb15892..3251c7bb3 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -6,158 +6,141 @@ A glossary of common terms used throughout OpenMS documentation. .. glossary:: :sorted: - LC-MS - LCMS - Liquid Chromatography-coupled Mass Spectrometry. + peptide-spectrum match + PSM + A method used in proteomics to identify proteins from a complex mixture.
Involves comparing the + mass spectra of peptide fragments generated from a protein sample with a database of predicted + spectra, in order to identify the protein that produced the observed peptides. - liquid chromatography - An analytical technique used to separate molecules of interest. + LC-MS + LCMS + :term:`Liquid chromatography`-coupled mass spectrometry. - mass spectrometry - An analytical technique used to identify and quantify molecules of interest. + LC-MS/MS + liquid chromatography coupled :term:`tandem mass spectrometry` + See :term:`LC-MS` and :term:`MS2`. - peptides - A short chain of amino acids. + liquid chromatography + LC + An analytical technique used to separate molecules of interest. - FASTA format + FASTA A text-based format for representing nucleotide or amino acid sequences. - octadecyl (C18) - An alkyl radical C(18)H(37) derived from an octadecane by removal of one hydrogen atom. - - mass - Mass is a measure of the amount of matter that an object contains. In comparison to often used term weight, which is a measure of the force of gravity on that object. - - ion - Any :term:`atom` or group of atoms that bears one or more positive or negative electrical charges. Positively charged are cations, negavtively charged anions. + C18 + octadecyl + Octadecyl (C18) is an alkyl radical C(18)H(37) derived from an octadecane by removal of one hydrogen atom. - electrospray ionization (ESI) - A technique used in :term:`mass spectrometry` to produce ions. + ESI + electrospray ionization + Electrospray ionization (ESI) is a technique used in MS to produce ions. - atom - An atom is the smallest unit of ordinary matter that forms a chemical element. - - aerosol - An aerosol is a suspension of fine solid particles or liquid droplets in air or another gas. 
+ MS2 + MS/MS + tandem mass spectrometry + Tandem MS is a technique where two or more mass analyzers are coupled together using an additional reaction step to increase their abilities to analyse chemical samples. - Time-of-flight (TOF) - A measurement of the time taken by an object, particle of wave (be it acoustic, electromagnetic, e.t.c) to travel a distance through a medium. + TOF + time-of-flight + Time-of-flight (TOF) is the time taken by an object, particle, or wave (be it acoustic, electromagnetic, etc.) to travel a distance through a medium. - quadrupole mass filters + quadrupole A mass filter allowing one mass channel at a time to reach the detector as the mass range is scanned. - Orbitrap analyzers - In :term:`mass spectrometry`, an ion trap mass analyzer consisting of an outer barrel-like electrode and a coaxial inner + orbitrap + In MS, an ion trap mass analyzer consisting of an outer barrel-like electrode and a coaxial inner spindle-like electrode that traps ions in an orbital motion around the spindle. - A high resolution :term:`mass spectrometry` analyzer. + A high resolution MS analyzer. MS1 - First stage to get a spectra. A sample is injected into the mass spectrometer, ionized, accelerated and analyzed by :term:`mass spectrometry`. - - MS2 - Tandem :term:`mass spectrometry`, also MS/MS, a technique where two or more mass analyzers are coupled together. - The ions from MS1 spectra are selectively fragmented and analyzed by a second stage of :term:`mass spectrometry`. + Mass spectra of a sample from a single fragmentation step. MS3 - Multi-stage Mass Spectrometry + Multi-stage MS - collision-induced dissociation (CID) - A :term:`mass spectrometry` technique to induce fragmentation of selected ions in the gas phase. Also known as Collision - induced dissociation. + CID + collision-induced dissociation + Collision-induced dissociation is an MS technique to induce fragmentation of selected ions in the gas phase. TOPP - The OpenMS Pipeline.
+ The OpenMS Pipeline is a set of chainable tools to create pipelines for mass spectrometry analysis. MSGFPlusAdapter - Adapter for the MS-GF+ protein identification (database search) engine. More information is available in the `OpenMS API reference documentation `__. + Adapter for the MS-GF+ protein identification (database search) engine. More information is available in the + `OpenMS API reference documentation `__. LuciphorAdapter Adapter for the LuciPHOr2: a site localisation tool of generic post-translational modifications from tandem mass spectrometry data. More information is available in the `OpenMS API reference documentation `__. - pyOpenMS - pyOpenMS is an open-source Python library for :term:`mass spectrometry`, specifically for the analysis of proteomics and - metabolomics data in Python. - TOPP tools - OpenMS provides a number of functions that process :term:`mass spectrometry` data called TOPP tools. All TOPP tools are described in the `OpenMS API reference documentation `__. + OpenMS provides a number of functions that process MS data called :term:`TOPP` tools. All :term:`TOPP` + tools are described in the `OpenMS API reference documentation `__. UTILS - Besides :term:`TOPP tools`, OpenMS offers a range of other tools. They are not included in :term:`TOPP` as they are not part of typical analysis pipelines. More information is present in `OpenMS API reference documentation `__. + Besides :term:`TOPP tools`, OpenMS offers a range of other tools. They are not included in :term:`TOPP` as they + are not part of typical analysis pipelines. More information is present in `OpenMS API reference documentation `__. TOPPView TOPPView is a viewer for MS and HPLC-MS data. - nightly Snapshot + nightly snapshot Untested installers and containers are known as the nightly snapshot. - proteomics - Proteomics is the large-scale study of proteins. 
- - proteins - Proteins are vital parts of living organisms, with many functions, for example composing the structural fibers of - muscle to the enzymes that catalyze the digestion of food to synthesizing and replicating DNA. - MascotAdapter - Used to identifies peptides in MS/MS spectra. Read more about MascotAdapter in the `OpenMS API reference documentation `__. + Used to identify peptides in MS2 spectra. Read more about MascotAdapter in the `OpenMS API reference documentation `__. - HPLC-MS - Data produced by High performance liquid chromatography (HPLC) separates components of a mixture, whereas mass - spectrometry (MS) offers the detection tools to identify them. + high performance liquid chromatography + HPLC + In high performance liquid chromatography (HPLC), analytes are dissolved in a pressurized solvent (mobile phase) + and pumped through a solid adsorbent material (stationary phase) packed into a + capillary column. Physicochemical properties of the analyte determine how strongly it + interacts with the stationary phase. mzML + mzml The mzML format is an open, XML-based format for mass spectrometer output files, developed with the full participation of vendors and researchers in order to create a single open format that would be supported by all software. mzData + mzdata mzData was the first attempt by the Proteomics Standards Initiative (PSI) from the Human Proteome Organization (HUPO) - to create a standardized format for Mass Spectrometry data. This format is now deprecated, and replaced by mzML. + to create a standardized format for MS data. This format is now deprecated, and replaced by mzML. mzXML + mzxml mzXML is an open data format for storage and exchange of mass spectroscopy data, developed at the SPC/Institute for Systems Biology. - spectra - Plural of spectrum. - - mass spectrum - A mass spectrum is a plot of the ion signal as a function of the mass-to-charge ratio. A mass spectrum is produced by a single :term:`mass spectrometry` run.
These spectra are used to determine the elemental or isotopic signature of a sample, the masses of particles and of molecules, and to elucidate the chemical identity or structure of molecules and other chemical compounds. OpenMS represents a one dimensional mass spectrum using the class `MSSpectrum `_. - - m/z - mass to charge ratio. - - retention time - retention time (RT) in liquid chromatography, is the time it takes for a separated analyte to move through the stationary phase. - ProteoWizard - ProteoWizard is a set of open-source, cross-platform tools and libraries for proteomics data analyses. It provides a framework for unified :term:`mass spectrometry` data file access and performs standard chemistry and LCMS dataset computations. + ProteoWizard is a set of open-source, cross-platform tools and libraries for proteomics data analyses. + It provides a framework for unified MS data file access and performs standard chemistry and LCMS dataset computations. PepNovo - PepNovo is a de novo sequencing algorithm for :term:`MS/MS` :term:`spectra`. + PepNovo is a :term:`de novo peptide sequencing` algorithm for :term:`MS2` spectra. de novo peptide sequencing - A peptide’s amino acid sequence is inferred directly from the precursor peptide mass and tandem mass spectrum (:term:`MS/MS` or :term:`MS^3`) fragment ions, without comparison to a reference proteome. + A peptide’s amino acid sequence is inferred directly from the precursor peptide mass and tandem + mass spectrum (:term:`MS2` or :term:`MS3`) fragment ions, without comparison to a reference proteome. TOPPAS - An assistant for GUI-driven TOPP workflow design. It is recommended to use OpenMS through the KNIME plugins. - - chromatogram - A two-dimensional plot that describes the amount of analyte eluted from a chromatography versus the analyte's retention time. OpenMS represents a chromatogram using the class `MSChromatogram `_. + An assistant for GUI-driven :term:`TOPP` workflow design.
It is recommended to use OpenMS through the KNIME plugins. KNIME An advanced workflow editor which OpenMS provides a plugin for. SILAC + stable isotope labeling with amino acids in cell culture Stands for Stable isotope labeling using amino acids in cell culture. iTRAQ Stands for isobaric tags for relative and absolute quantitation. TMT - Tandem Mass Tag (TMT) is a :term:`mass spectrometry` based system designed to identify and quantify proteins in different samples. + Tandem Mass Tag (TMT) is a MS based system designed to identify and quantify proteins in different samples. SRM - Selected reaction monitoring is a :term:`mass spectrometry` technique for small molecule analysis. + Selected reaction monitoring is a MS technique for small molecule analysis. SWATH Stands for sequential acquisition of all theoretical fragment ion spectra. @@ -165,27 +148,17 @@ A glossary of common terms used throughout OpenMS documentation. OpenMS API An interface that allows developers to use OpenMS core library classes and methods. - RT - Retention time. - - MS - Mass Spectrometry - - feature - An LC-MS feature represents the combined isotopic mass traces of a detected chemical compound. The chromatographic peak shape of a feature is defined by the interaction of the analyte with the LC column. Each feature contains information on retention time, mass-to-charge ratio, intensity and overall quality. OpenMS represents a feature using the class `Feature `_. - + feature maps feature map - A feature map is a collection of features identified in a mass spectrum from a single experiment. One feature map can contain many features. OpenMS represents a feature map using the class `FeatureMap `_. + A feature map is a collection of features identified in a mass spectrum from a single experiment. + One feature map can contain many features. OpenMS represents a feature map using the class `FeatureMap `_. 
+ consensus features consensus feature - Features from replicate experiments with similar retention times and m/z values are linked and considered a consensus feature. A consensus feature contains information on the common retention time and m/z values as well as intensities for each sample. OpenMS represents a consensus feature using the class `ConsensusFeature `_. + Features from replicate experiments with similar retention times and m/z values are linked and considered a consensus feature. + A consensus feature contains information on the common retention time and m/z values as well as intensities for each sample. OpenMS represents a consensus feature using the class `ConsensusFeature `_. + consensus maps consensus map - A consensus map is a collection of :term:`consensus features ` identified from mass spectra across replicate experiments. One consensus map can contain many consensus features. OpenMS represents a consensus map using the class `ConsensusMap `_. - - peak - A single raw data point in a chromatogram or a mass spectrum. OpenMS represents a peak in a chromatogram using the class `ChromatogramPeak `_. OpenMS represents a single, one-dimensional peak in a mass spectrum using the class `PeakID `_. - - MSExperiment - An OpenMS class (:py:class:`~.MSExperiment`) used to represent a single :term:`mass spectrometry` run. `Read the documentation for further information `_. - + A consensus map is a collection of :term:`consensus features` identified from mass spectra across replicate experiments. + One consensus map can contain many consensus features. OpenMS represents a consensus map using the class `ConsensusMap `_. 
diff --git a/docs/source/id_by_mz.rst b/docs/source/identification_accurate_mass.rst similarity index 86% rename from docs/source/id_by_mz.rst rename to docs/source/identification_accurate_mass.rst index b3a06bfdb..cbd223858 100644 --- a/docs/source/id_by_mz.rst +++ b/docs/source/identification_accurate_mass.rst @@ -1,13 +1,13 @@ Identification by Accurate Mass =============================== -Example workflow for the processing of a set of mzML files (defined in the ``files`` variable) including centroiding, -feature detection, feature linking and accurate mass search. -The resulting data gets processed in a pandas data frame with feature filtering (missing values, quality) and imputation +Example workflow for the processing of a set of :term:`mzML` files (defined in the ``files`` variable) including centroiding, +feature detection, feature linking and accurate mass search. +The resulting data gets processed in a pandas data frame with feature filtering (missing values, quality) and imputation of remaining missing values. Compounds detected during accurate mass search will be annotated in the resulting dataframe. -Imports and mzML file path -************************** +Imports and :term:`mzML` file path +********************************** .. code-block:: python :linenos: @@ -15,17 +15,12 @@ Imports and mzML file path import os import shutil import requests - import pandas as pd - from pyopenms import * - import numpy as np - from sklearn.impute import KNNImputer from sklearn.preprocessing import FunctionTransformer from sklearn.pipeline import Pipeline - import plotly.graph_objects as go import plotly.express as px import matplotlib.pyplot as plt @@ -33,6 +28,7 @@ Imports and mzML file path # set path to your mzML files, or leave like this to use the example data files = os.path.join(os.getcwd(), "IdByMz_Example") + Download Example Data ********************* Execute this cell only for the example workflow.
@@ -64,9 +60,9 @@ Centroiding *********** If files are already centroided this step can bet omitted. -in: path to MS data (files) +``in``: path to MS data (files) -out: path to centroided mzML files in a subfolder 'centroid' (files) +``out``: path to centroided :term:`mzML` files in a subfolder 'centroid' (files) .. code-block:: python :linenos: @@ -90,9 +86,9 @@ out: path to centroided mzML files in a subfolder 'centroid' (files) Feature Detection ***************** -in: path to centroid mzML files (files) +``in``: path to centroid :term:`mzML` files (files) -out: list of :py:class:`~.FeatureMap` (feature_maps) +``out``: list of :py:class:`~.FeatureMap` (feature_maps) .. code-block:: python :linenos: @@ -151,9 +147,9 @@ out: list of :py:class:`~.FeatureMap` (feature_maps) Feature Map Retention Time Alignment ************************************ -in: unaligned list of :py:class:`~.FeatureMap` (feature_maps) +``in``: unaligned list of :py:class:`~.FeatureMap` (feature_maps) -out: list of :py:class:`~.FeatureMap` aligned to the first feature map in the list (feature_maps) +``out``: list of :py:class:`~.FeatureMap` aligned to the first :term:`feature map` in the list (feature_maps) .. code-block:: python :linenos: @@ -178,7 +174,7 @@ out: list of :py:class:`~.FeatureMap` aligned to the first feature map in the li feature_map, trafo, True ) # store original RT as meta value -Visualization of RTs before and after alignment +Visualization of RTs before and after Alignment *********************************************** .. code-block:: python @@ -235,9 +231,9 @@ Visualization of RTs before and after alignment Feature Linking *************** -in: list of:py:class:`~.FeatureMap` (feature_maps) +``in``: list of :py:class:`~.FeatureMap` (feature_maps) -out: :py:class:`~.ConsensusMap` (consensus_map) +``out``: :py:class:`~.ConsensusMap` (consensus_map) ..
code-block:: python :linenos: @@ -259,11 +255,11 @@ out: :py:class:`~.ConsensusMap` (consensus_map) consensus_map.setColumnHeaders(file_descriptions) feature_grouper.group(feature_maps, consensus_map) -ConsensusMap to pandas DataFrame +ConsensusMap to Pandas DataFrame ******************************** -in: :py:class:`~.ConsensusMap` (consensus_map) +``in``: :py:class:`~.ConsensusMap` (consensus_map) -out: DataFrame with RT, mz and quality from :py:class:`~.ConsensusMap` (cm_df) +``out``: DataFrame with RT, mz and quality from :py:class:`~.ConsensusMap` (cm_df) .. code-block:: python :linenos: @@ -278,9 +274,9 @@ out: DataFrame with RT, mz and quality from :py:class:`~.ConsensusMap` (cm_df) Accurate Mass Search ******************** -in: :py:class:`~.ConsensusMap` (consensus_map) +``in``: :py:class:`~.ConsensusMap` (consensus_map) -out: DataFrame with :py:class:`~.AccurateMassSearchEngine` results (ams_df) +``out``: DataFrame with :py:class:`~.AccurateMassSearchEngine` results (ams_df) .. code-block:: python :linenos: @@ -328,9 +324,9 @@ out: DataFrame with :py:class:`~.AccurateMassSearchEngine` results (ams_df) Data Filtering and Imputation ***************************** -in: unfiltered :py:class:`~.ConsensusMap` DataFrame (cm_df) +``in``: unfiltered :py:class:`~.ConsensusMap` DataFrame (cm_df) -out: features below minimum quality and with too many missing values removed, +``out``: features below minimum quality and with too many missing values removed, remaining missing values imputed with KNN algorithm (cm_df) .. 
code-block:: python @@ -368,11 +364,11 @@ remaining missing values imputed with KNN algorithm (cm_df) cm_df = imputer.fit_transform(cm_df) cm_df -Annotate features with identified compounds -******************************************* -in: :py:class:`~.ConsensusMap` DataFrame without identifications (cm_df) and AccurateMassSearch DataFrame (ams_df) +Annotate Features with Identified Compounds +************************************************************ +``in``: :py:class:`~.ConsensusMap` DataFrame without identifications (cm_df) and :py:class:`~.AccurateMassSearch` DataFrame (ams_df) -out: :py:class:`~.ConsensusMap` DataFrame with new identifications column (id_df) +``out``: :py:class:`~.ConsensusMap` DataFrame with new identifications column (id_df) .. code-block:: python :linenos: @@ -399,8 +395,8 @@ out: :py:class:`~.ConsensusMap` DataFrame with new identifications column (id_df id_df.to_csv(os.path.join(files, "result.tsv"), sep="\t", index=False) id_df -Visualize consensus features with identifications -************************************************* +Visualize :term:`Consensus Features` with Identifications +***************************************************************************** .. code-block:: python :linenos: diff --git a/docs/source/datastructures_id.rst b/docs/source/identification_data.rst similarity index 93% rename from docs/source/datastructures_id.rst rename to docs/source/identification_data.rst index cb21e479a..b315b36d9 100644 --- a/docs/source/datastructures_id.rst +++ b/docs/source/identification_data.rst @@ -5,26 +5,27 @@ In OpenMS, identifications of peptides, proteins and small molecules are stored in dedicated data structures. These data structures are typically stored to disc as idXML or mzIdentML file. The highest-level structure is :py:class:`~.ProteinIdentification`. It stores all identified proteins of an identification -run as ProteinHit objects plus additional metadata (search parameters, etc.).
Each +run as :py:class:`~.ProteinHit` objects plus additional metadata (search parameters, etc.). Each :py:class:`~.ProteinHit` contains the actual protein accession, an associated score, and (optionally) the protein sequence. A :py:class:`~.PeptideIdentification` object stores the data corresponding to a single identified spectrum or feature. It has members for the retention time, m/z, and a vector of :py:class:`~.PeptideHit` objects. Each :py:class:`~.PeptideHit` -stores the information of a specific peptide-to-spectrum match or PSM (e.g., the score +stores the information of a specific :term:`peptide-spectrum match` or :term:`PSM` (e.g., the score and the peptide sequence). Each :py:class:`~.PeptideHit` also contains a vector of :py:class:`~.PeptideEvidence` objects which store the reference to one or more (in the case the peptide maps to multiple proteins) proteins and the position therein. .. NOTE:: - Protein Ids are linked to Peptide Ids by a common identifier (e.g., a unique string of time and date of the search). + Protein Ids are linked to peptide Ids by a common identifier (e.g., a unique string of time and date of the search). The Identifier can be set using the :py:meth:`~.ProteinIdentification.setIdentifier` method in :py:class:`~.ProteinIdentification` and :py:class:`~.PeptideIdentification`. Similarly :py:meth:`~.ProteinIdentification.getIdentifier` can be used to check the link between them. - With the link one can retrieve search meta data (which is stored at the protein level) for individual Peptide Ids. + With the link one can retrieve search meta data (which is stored at the protein level) for individual peptide Ids. .. code-block:: python + :linenos: protein_id = ProteinIdentification() peptide_id = PeptideIdentification() @@ -36,13 +37,14 @@ peptide maps to multiple proteins) proteins and the position therein. 
# Prints the Identifier print("Protein Identifier -", protein_id.getIdentifier()) print("Peptide Identifier -", peptide_id.getIdentifier()) + .. code-block:: output Protein Identifier - IdentificationRun1 Peptide Identifier - IdentificationRun1 -ProteinIdentification -********************** +Protein Identification +*********************** We can create an object of type :py:class:`~.ProteinIdentification` and populate it with :py:class:`~.ProteinHit` objects as follows: @@ -146,8 +148,8 @@ corresponding :py:class:`~.PeptideHit` objects: peptide_id.setHits([peptide_hit, peptide_hit2]) -This allows us to represent single spectra (:py:class:`~.PeptideIdentification` at *m/z* -440.0 and *rt* 1234.56) with possible identifications that are ranked by score. +This allows us to represent single spectra (:py:class:`~.PeptideIdentification` at m/z +:math:`440.0` and *rt* :math:`1234.56`) with possible identifications that are ranked by score. In this case, apparently two possible peptides match the spectrum which have the first three amino acids in a different order "DLQ" vs "QDL"). @@ -175,7 +177,7 @@ We can now display the peptides we just stored: -Storage on disk +Storage on Disk *************** Finally, we can store the peptide and protein identification data in a diff --git a/docs/source/index.rst b/docs/source/index.rst index 8ac5c6586..3af22620a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,28 +3,26 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Summary +Index ======= -pyOpenMS is an open-source Python library for :term:`mass spectrometry`, specifically for the analysis of proteomics -and metabolomics data in Python. pyOpenMS implements a set of Python bindings to the OpenMS library for computational -:term:`mass spectrometry` and is available for Windows, Linux and OSX. 
+pyOpenMS is an open-source Python library for mass spectrometry, specifically for the analysis of +proteomics and metabolomics data in Python. pyOpenMS implements a set of Python bindings to +the OpenMS library for computational mass spectrometry and is available for Windows, Linux and OSX. -PyOpenMS provides functionality that is commonly used in computational mass -spectrometry. The pyOpenMS package contains Python bindings for a large part of the `OpenMS `_ -library for :term:`mass spectrometry` based proteomics. It thus provides facile access to a feature-rich, open-source algorithm +PyOpenMS provides functionality that is commonly used in computational mass spectrometry. +The pyOpenMS package contains Python bindings for a large part of the `OpenMS `_ +library for mass spectrometry based proteomics. It thus provides access to a feature-rich, open-source algorithm library for mass-spectrometry based proteomics analysis. -pyOpenMS facilitates the execution of common tasks in proteomics (and other mass spectrometric fields) such as +pyOpenMS facilitates the execution of common tasks in proteomics (and other fields of mass spectrometry) such as -- File handling (:term:`mzXML`, :term:`mzML`, TraML, mzTab, :term:`FASTA `, pepxml, protxml, mzIdentML among others) +- File handling (:term:`mzXML`, :term:`mzML`, TraML, mzTab, :term:`FASTA`, pepxml, protxml, mzIdentML among others) - Chemistry (mass calculation, peptide fragmentation, isotopic abundances) - Signal processing (smoothing, filtering, de-isotoping, retention time correction and peak-picking) -- Identification analysis (including peptide search, PTM analysis, Cross-linked analytes, FDR control, -RNA oligonucleotide search and small molecule search tools) +- Identification analysis (including peptide search, PTM analysis, cross-linked analytes, FDR control, RNA oligonucleotide search and small molecule search tools) - Quantitative analysis (including label-free, metabolomics, :term:`SILAC`, :term:`iTRAQ` and 
:term:`SWATH`/DIA analysis tools) -- Chromatogram analysis (chromatographic peak picking, smoothing, elution profiles and peak scoring for -:term:`SRM`/MRM/PRM/:term:`SWATH`/DIA data) +- Chromatogram analysis (chromatographic peak picking, smoothing, elution profiles and peak scoring for :term:`SRM`/MRM/PRM/:term:`SWATH`/DIA data) - Interaction with common tools in proteomics and metabolomics - Search engines such as Comet, Crux, Mascot, MSGFPlus, MSFragger, Myrimatch, OMSSA, Sequest, SpectraST, XTandem @@ -49,15 +47,15 @@ RNA oligonucleotide search and small molecule search tools) :maxdepth: 2 :caption: Mass Spectrometry Concepts - datastructures_peak + ms_data chemistry - aasequences - nasequences - theoreticalspectrumgenerator - spectrumalignment + peptides_proteins + oligonucleotides_rna + fragment_spectrum_generation + spectrum_alignment digestion - datastructures_id - datastructures_quant + identification_data + quantitative_data .. toctree:: :maxdepth: 2 @@ -67,23 +65,23 @@ RNA oligonucleotide search and small molecule search tools) algorithms smoothing centroiding - normalization - deisotoping + spectrum_normalization + charge_isotope_deconvolution feature_detection map_alignment feature_linking peptide_search chromatographic_analysis - mzqc_export + quality_control mass_decomposition - GNPS_export + export_files_GNPS .. toctree:: :maxdepth: 2 :caption: Example Workflows - id_by_mz - metabolomics_preprocessing + identification_accurate_mass + untargeted_metabolomics_preprocessing .. 
toctree:: :maxdepth: 2 @@ -95,18 +93,18 @@ RNA oligonucleotide search and small molecule search tools) :maxdepth: 2 :caption: Advanced Topics - file_handling - other_file_handling - mzMLFileFormat - hyperscore - pandas_df_conversion - massql + reading_raw_ms_data + other_ms_data_formats + mzml_files + scoring_spectra_hyperscore + export_pandas_dataframe + query_msexperiment_massql memory_management pyopenms_in_r build_from_source - wrap_classes + wrapping_workflows_new_classes interactive_plots - ML_tutorial + interfacing_ml_libraries .. toctree:: :maxdepth: 2 @@ -123,7 +121,7 @@ RNA oligonucleotide search and small molecule search tools) glossary -Indices and tables +Indices and Tables ================== * :ref:`genindex` diff --git a/docs/source/interactive_plots.rst b/docs/source/interactive_plots.rst index 1dacf90f2..93750f1da 100644 --- a/docs/source/interactive_plots.rst +++ b/docs/source/interactive_plots.rst @@ -1,10 +1,10 @@ -Interactive plots +Interactive Plots ================= -With special plotting libraries like holoviews and datashader for big -data visualization as well as bokeh for interactiveness, we can use the -functionality of pyopenms to quickly create fully interactive views of -:term:`mass spectrometry` data. Here we plot a full map of MS1 that can be +With special plotting libraries like ``holoviews`` and ``datashader`` for big +data visualization as well as ``bokeh`` for interactiveness, we can use the +functionality of pyOpenMS to quickly create fully interactive views of +mass spectrometry data. Here we plot a full map of :term:`MS1` that can be interactively zoomed-in if you execute the code in a notebook (e.g. on Binder, see the button on top of the page). 
diff --git a/docs/source/ML_tutorial.rst b/docs/source/interfacing_ml_libraries.rst similarity index 92% rename from docs/source/ML_tutorial.rst rename to docs/source/interfacing_ml_libraries.rst index 096f65e55..d9c4588b8 100644 --- a/docs/source/ML_tutorial.rst +++ b/docs/source/interfacing_ml_libraries.rst @@ -1,4 +1,4 @@ -Interfacing with ML libraries +Interfacing with ML Libraries ============================= Overview @@ -6,13 +6,13 @@ Overview Machine Learning is the field of study that gives computers the capability to learn without being explicitly programmed. Machine learning (ML) is well known for its powerful ability to recognize -patterns and signals. Recently, the :term:`mass spectrometry` community has embraced ML techniques for large-scale data analysis. +patterns and signals. Recently, the mass spectrometry community has embraced ML techniques for large-scale data analysis. -Predicting accurate retention times has shown to improve identification in bottom-up proteomics. +Predicting accurate retention times (RT) has shown to improve identification in bottom-up proteomics. -In this tutorial we will predict the retention time from amino acid sequence data using simple machine learning methods. +In this tutorial we will predict the RT from amino acid sequence data using simple machine learning methods. -First, we import all neccessary libraries for this tutorial. +First, we import all necessary libraries for this tutorial. .. code-block:: ipython3 :linenos: @@ -40,19 +40,19 @@ Once we have imported all libraries successfully, we are going to store the data urlretrieve(gh + "/src/data/pyOpenMS_ML_Tutorial.tsv", "data.tsv") tsv_data = pd.read_csv("data.tsv", sep="\t", skiprows=17) -Here we have prepared a tsv file that contains three columns **sequence** , **charge** and **retention** time. +Here, we have prepared a ``tsv`` file that contains three columns ``sequence``, RT and ``charge``. 
Note that this table could also be easily created from identification data as produced in previous chapters. Before we move forward lets try to understand more about our data: a. Sequence - Chains of amino acids form peptides or proteins. -The arrangement of amino acids is reffered as amino acid sequence. +The arrangement of amino acids is referred to as the amino acid sequence. The composition and order of amino acids affect the physicochemical properties of the peptide and lead to different retention in the column. b. Retention time (RT) - is the time taken for an analyte to pass through a chromatography column. From the amino acid sequence we can derive additional properties (machine learning features) used to train -our machine learning model. +our model. We can easily check for its shape by using the tsv_data.shape attribute, which will return the size of the dataset. @@ -82,7 +82,7 @@ Explore the top 5 rows of the dataset by using head() method on pandas DataFrame 3 SGTHNMYK 625.982520 2 4 AARPTRPDK 626.073300 3 -As the RT column is our response variable, we will be storing it seperately as Y1_test +As the RT column is our response variable, we will be storing it separately as Y1_test .. code-block:: python :linenos: @@ -179,15 +179,17 @@ Modelling from sklearn.metrics import mean_squared_error from sklearn.model_selection import ShuffleSplit + .. code-block:: python :linenos: test_df = df.copy() test_df = test_df.drop("sequence", axis=1) + Now, we create the train and test set for cross-validation of the results using the ``train_test_split`` function from sklearn's model_selection module with test_size -size equal to 30% of the data. Also, to maintain reproducibility of the results, a random_state is also assigned. +size equal to 30% of the data. To maintain reproducibility of the results, a random_state is also assigned. .. code-block:: python :linenos: @@ -197,6 +199,7 @@ size equal to 30% of the data.
Also, to maintain reproducibility of the results, test_df, Y1_test, test_size=0.3, random_state=3 ) + We will be using the ``XGBRegressor()`` class because it is clearly a regression problem as the response variable ( retention time ) is continuous. .. code-block:: python @@ -210,7 +213,8 @@ We will be using the ``XGBRegressor()`` class because it is clearly a regression max_depth=7, ) -Fit the regressor to the training set and make predictions on the test set using the familiar .fit() and .predict() methods. + +Fit the regressor to the training set and make predictions on the test set using the familiar ``.fit()`` and ``.predict()`` methods. .. code-block:: python :linenos: @@ -218,6 +222,7 @@ Fit the regressor to the training set and make predictions on the test set using xg_reg.fit(X_train, Y_train) Y_pred = xg_reg.predict(X_test) + Compute the root mean square error (rmse) using the mean_sqaured_error function from sklearn's metrics module. .. code-block:: python @@ -226,6 +231,7 @@ Compute the root mean square error (rmse) using the mean_sqaured_error function rmse = np.sqrt(mean_squared_error(Y_test, Y_pred)) print("RMSE: %f" % (rmse)) + .. code-block:: output RMSE: 437.017290 @@ -240,6 +246,7 @@ Store the **Observed** v/s **Predicted** value in pandas dataframe and print. ) print(k) + .. code-block:: output Observed Predicted @@ -255,9 +262,10 @@ Store the **Observed** v/s **Predicted** value in pandas dataframe and print. 4767 5515.94682 5491.597168 4768 2257.63092 2258.312988 + We will now generate a **Observed** v/s **Predicted** plot that gives a high level overview about the model performance. We can clearly see that only few outliers are there and most of them lie in between the central axis. -This means that prediction actually worked and observed and predicted value won't differ too much. +This means that prediction actually works and observed and predicted value won't differ too much. .. 
code-block:: python :linenos: @@ -266,6 +274,7 @@ This means that prediction actually worked and observed and predicted value won' x="Observed", y="Predicted", data=k, scatter_kws={"alpha": 0.2, "s": 5} ) + .. image:: img/ml_tutorial_predicted_vs_observed.png .. code-block:: python @@ -314,6 +323,7 @@ k-fold cross validation via the cv() method. All we have to do is specify the nf print("---------------------") print(df) + .. code-block:: output Fold-1 diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index edc920f86..f1a03fab4 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -14,10 +14,10 @@ Proteomics and metabolomics are interdisciplinary research fields that study the function and interaction of proteins and metabolites. They employ large-scale experimental techniques that allow acquiring data at the level of cellular systems to whole organisms. One of the main analytical method to identify, characterize or quantify -proteins and metabolites is :term:`mass spectrometry (MS)` combined with chromatographic +proteins and metabolites is mass spectrometry (MS) combined with chromatographic separation. -In :term:`mass spectrometry`-based proteomics and metabolomics, biological samples are +In mass spectrometry-based proteomics and metabolomics, biological samples are extracted, prepared, and separated to reduce sample complexity. The separated analytes are ionized and measured in the mass spectrometer. Mass and abundance of ions are stored in mass spectra and used to identify and quantify the analytes in the sample @@ -25,52 +25,52 @@ using computational methods. The quantity and identity of analytes can then be u in biomarker discovery, medical diagnostics, or basic research. 
-Liquid Chromatography ---------------------- -LC reduces the complexity of a sample by separating analytes +:term:`Liquid Chromatography` (:term:`LC`) +----------------------------------------------------------------- +:term:`LC` reduces the complexity of a sample by separating analytes based on their physicochemical properties. Separating analytes in time ensures that a manageable amount of analytes elute at the same time. Ideally, the amount is such that each peak in the mass spectrum corresponds to one single analyte. -In :term:`mass spectrometry`-based proteomics, (high-pressure) liquid chromatographic -separation techniques (HPLC) achieve a high degree of -separation. In HPLC, analytes are dissolved in a pressurized solvent (mobile phase) +In mass spectrometry-based proteomics, :term:`high performance liquid chromatography` +separation techniques (:term:`HPLC`) achieve a high degree of +separation. In :term:`HPLC`, analytes are dissolved in a pressurized solvent (mobile phase) and pumped through a solid adsorbent material (stationary phase) packed into a capillary column. Physicochemical properties of the analyte determine how strongly it -interacts with the stationary phase. The most common HPLC technique in proteomics +interacts with the stationary phase. The most common :term:`HPLC` technique in proteomics and metabolomics uses reversed-phase chromatography (RPC). RPC employs a hydrophobic -stationary phase like octadecyl (C18), a nonpolar carbon chain bonded to a silica base, +stationary phase like :term:`octadecyl` (:term:`C18`), a nonpolar carbon chain bonded to a silica base, and a polar mobile phase (solvent). Polar molecules interact weakly with the stationary phase and elute earlier, while non-polar molecules are retained. Interaction can be further modulated by changing the gradient of solvent concentration in the mobile phase -over time. 
Elution times in LC are inherently prone to variation, for example, due -to fluctuations in the flow rate of the mobile phase or change of the column. Retention -time shifts between runs may be compensated using computational chromatographic +over time. Elution times in :term:`LC` are inherently prone to variation, for example, due +to fluctuations in the flow rate of the mobile phase or change of the column. Retention time shifts between runs may be compensated using computational chromatographic retention time alignment methods. In the :term:`LC-MS` setup, the column is directly coupled to the ion source of the mass spectrometer. .. image:: img/introduction_LC.png -Mass Spectrometry ------------------ +Mass Spectrometry (MS) +--------------------------------------------------------- + MS is an analytical technique used to determine the mass of molecules. In order to -achieve accurate and sensitive mass measurements at the atomic scale, mass -spectrometers manipulate charged ions using magnetic and electrostatic fields. +achieve accurate and sensitive mass measurements at the atomic scale, mass spectrometers +manipulate charged ions using magnetic and electrostatic fields. .. image:: img/introduction_MS.png In a typical mass spectrometer, three principal components can be identified: -* Ion Source: A mass spectrometer only handles ions. Thus, charge needs first be transferred to uncharged analytes. The component responsible for the ionization is the ion source. Different types of ion sources and ionization techniques exist with electrospray ionization (ESI) being currently the most widely used ionization technique. +* Ion Source: A mass spectrometer only handles ions. Thus, charge first needs to be transferred to uncharged analytes. The component responsible for the ionization is the ion source. Different types of ion sources and ionization techniques exist with :term:`electrospray ionization` (:term:`ESI`) being currently the most widely used ionization technique.
-* Mass Analyzer: the most most commonly used mass analyzers are time-of-flight (TOF), quadrupole mass, and orbitrap analyzers. In TOF mass analyzers, the ions are accelerated in an electric field. The flight time of an ion is used to calculate the mass-to-charge ratio (m/z). Varying the electric field allows filtering certain mass-to-charge ratios before they enter the detector. In quadrupole mass filters, ions pass through an oscillating electric field created by four parallel rods. For a particular field, only ions in a certain mass-to-charge range will reach the detector. The orbitrap traps ions in orbital motion between a barrel-like outer electrode and a spindle-like central electrode allowing for prolonged mass measurement. As a result of the prolonged measurement, a high mass resolution can be achieved at the expense of a smaller throughput. +* Mass Analyzer: the most commonly used mass analyzers are :term:`time-of-flight` (:term:`TOF`), :term:`quadrupole` mass, and :term:`orbitrap` analyzers. In :term:`TOF` mass analyzers, the ions are accelerated in an electric field. The flight time of an ion is used to calculate the mass-to-charge ratio (m/z). Varying the electric field allows filtering certain mass-to-charge ratios before they enter the detector. In :term:`quadrupole` mass filters, ions pass through an oscillating electric field created by four parallel rods. For a particular field, only ions in a certain mass-to-charge range will reach the detector. The :term:`orbitrap` traps ions in orbital motion between a barrel-like outer electrode and a spindle-like central electrode allowing for prolonged mass measurement. As a result of the prolonged measurement, a high mass resolution can be achieved at the expense of a smaller throughput. * Detector: The last component of the mass spectrometer is the detector. It determines the abundance of ions that passed through the mass analyzer.
Ion intensities (a value that relates to its abundance) and the mass-to-charge ratio are recorded in a mass spectrum. -A sample is measured over the retention time of the chromatography typically resulting in tens of thousands of spectra. The measurement of one sample is called an MS run and the set of spectra called an MS or peakmap. +A sample is measured over the retention time of the chromatography typically resulting in tens of thousands of mass spectra. The measurement of one sample is called an MS run and the set of mass spectra called an MS or peak map. .. figure:: img/spectrum_peakmap.png - Left: spectrum with peaks (m/z and intensity values), Right: spectra stacked in retention time yield a peak map. The spectrum in the peak map at the retention time indicated by the red line in the right panel is plotted as a spectrum (intensity over m/z) in the left panel. + Left: mass spectrum with peaks (m/z and intensity values), Right: mass spectra stacked in retention time yield a peak map. The mass spectrum in the peak map at the retention time indicated by the red line in the right panel is plotted as a mass spectrum (intensity over m/z) in the left panel. -Identification of an analyte based on the mass spectrum (mass-to-charge ratio and isotope pattern) can be ambiguous. To improve identification, tandem mass spectrometry (MS2) can be applied to assess the analyte substructure. With MS2 spectrometry, an ion is isolated, fragmented using an inert gas by collision-induced fragmentation (CID) and a second mass spectrum is recorded from the ion fragments. In this context, the primary ion is called the precursor ion, the primary spectrum is called an MS1 spectrum and and the spectrum from the fragments is called an MS2 (MS/MS) spectrum. Tandem mass spectrometry is especially useful for linear polymers like proteins, RNA and DNA and the fragments typically break the polymer into two parts. 
For example, peptides (short strands of amino acids, part of a protein) typically break between each of the amino acids, leading to a so-called ion ladder where the distance between each peak in the MS2 spectrum reveals the identity of the amino acid, as most amino acids have different masses. +Identification of an analyte based on the mass spectrum (mass-to-charge ratio and isotope pattern) can be ambiguous. To improve identification, :term:`tandem mass spectrometry` (:term:`MS2`) can be applied to assess the analyte substructure. With :term:`MS2` spectrometry, an ion is isolated, fragmented using an inert gas by collision-induced fragmentation (CID) and a second mass spectrum is recorded from the ion fragments. In this context, the primary ion is called the precursor ion, the primary spectrum is called an :term:`MS1` spectrum and the spectrum from the fragments is called an :term:`MS2` spectrum. :term:`MS2` is especially useful for linear polymers like proteins, RNA and DNA and the fragments typically break the polymer into two parts. For example, peptides (short strands of amino acids, part of a protein) typically break between each of the amino acids, leading to a so-called ion ladder where the distance between each peak in the :term:`MS2` spectrum reveals the identity of the amino acid, as most amino acids have different masses. diff --git a/docs/source/map_alignment.rst b/docs/source/map_alignment.rst index 0c857aac2..cba202366 100644 --- a/docs/source/map_alignment.rst +++ b/docs/source/map_alignment.rst @@ -1,11 +1,11 @@ Map Alignment =============== -The pyOpenMS map alignment algorithms transform different maps (peak maps, feature maps) to a common retention time axis. +The pyOpenMS map alignment algorithms transform different maps (peak maps, :term:`feature maps`) to a common retention time axis. ..
image:: img/map_alignment_illustration.png -Note: Creating a consensus map from the aligned maps is performed by a feature linking algorithm (see next chapter). +Note: Creating a :term:`consensus map` from the aligned maps is performed by a feature linking algorithm (see next chapter). Different map alignment algorithms are available in pyOpenMS: @@ -17,7 +17,7 @@ Different map alignment algorithms are available in pyOpenMS: - :py:class:`~.MapAlignmentTransformer` To perform a simple linear alignment we can employ the algorithm :py:class:`~.MapAlignmentAlgorithmPoseClustering`. -In the example below it is used for the alignment of feature maps. +In the example below it is used for the alignment of :term:`feature maps`. Download Example Data ********************* @@ -49,8 +49,8 @@ Download Example Data Map Alignment Algorithm *********************** -From the list of feature maps, the one with the largest number of features is selected for reference. -The retention times of the other feature maps are aligned to this. +From the list of :term:`feature maps`, the one with the largest number of features is selected for reference. +The retention times of the other :term:`feature maps` are aligned to this. .. code-block:: python @@ -78,7 +78,7 @@ The retention times of the other feature maps are aligned to this. Visualization ************* -Plotting consensus maps with features before and after alignment. +Plotting :term:`consensus maps` with features before and after alignment. ..
code-block:: python diff --git a/docs/source/mass_decomposition.rst b/docs/source/mass_decomposition.rst index 55ef97894..1f2c8bc69 100644 --- a/docs/source/mass_decomposition.rst +++ b/docs/source/mass_decomposition.rst @@ -1,13 +1,13 @@ Mass Decomposition ================== -Fragment mass to amino acid composition +Fragment Mass to Amino Acid Composition *************************************** -One challenge often encountered in :term:`mass spectrometry` is the question of the +One challenge often encountered in mass spectrometry is the question of the composition of a specific mass fragment only given its mass. For example, for -the internal fragment mass ``262.0953584466`` there are three different -interpretations within a narrow mass band of 0.05 Th: +the internal fragment mass :math:`262.0953584466` there are three different +interpretations within a narrow mass band of :math:`0.05\ Th`: .. code-block:: python @@ -47,12 +47,12 @@ potential amino acid combinations that explain a certain mass in the for d in decomps: print(d.toExpandedString()) -Which outputs the three potential compositions for the mass ``262.0953584466``. +Which outputs the three potential compositions for the mass :math:`262.0953584466`. Note that every single combination of amino acids is only printed once, e.g. only ``DF`` is reported while the isobaric ``FD`` is not reported. This makes the algorithm more efficient. -Naive algorithm +Naive Algorithm *************** We can compare this result with a more naive algorithm which simply iterates @@ -83,7 +83,7 @@ Note that this approach is substantially slower than the OpenMS algorithm and also does not treat ``DF`` and ``FD`` as equivalent, instead outputting them both as viable solutions. -Stand-alone Program +Stand-Alone Program ******************* We can use pyOpenMS to write a short program that takes a mass and outputs all @@ -122,8 +122,8 @@ line 8 and 9). 
We can call it as follows: python mass_decomposition.py 999.4773990735001 0.001 Try to change the tolerance parameter. The parameter has a very large influence -on the reported results, for example for ``1.0`` tolerance, the algorithm will -produce 80 463 results while for a ``0.001`` tolerance, only 911 results are +on the reported results, for example for :math:`1.0` tolerance, the algorithm will +produce :math:`80,463` results while for a :math:`0.001` tolerance, only :math:`911` results are expected. Spectrum Tagger diff --git a/docs/source/memory_management.rst b/docs/source/memory_management.rst index e0d0f3ff9..fed2dfd42 100644 --- a/docs/source/memory_management.rst +++ b/docs/source/memory_management.rst @@ -1,8 +1,8 @@ -Memory management +Memory Management ================== On order to save memory, we can avoid loading the whole file into memory and -use the OnDiscMSExperiment for reading data. +use the :py:class:`~.OnDiscMSExperiment` for reading data. .. code-block:: python :linenos: @@ -41,6 +41,6 @@ by using del consumer Make sure you do not forget ``del consumer`` since otherwise the final part of -the mzML may not get written to disk (and the consumer is still waiting for new +the :term:`mzML` may not get written to disk (and the consumer is still waiting for new data). diff --git a/docs/source/datastructures_peak.rst b/docs/source/ms_data.rst similarity index 86% rename from docs/source/datastructures_peak.rst rename to docs/source/ms_data.rst index 20b442eb5..bc6890f78 100644 --- a/docs/source/datastructures_peak.rst +++ b/docs/source/ms_data.rst @@ -9,13 +9,13 @@ have already worked with in the `Getting Started `_ tutorial. :py:class:`~.MSSpectrum` is a container for 1-dimensional peak data (a container of :py:class:`~.Peak1D`). You can access these objects directly, by using an iterator or indexing. Meta-data is accessible through inheritance of the :py:class:`~.SpectrumSettings` -objects which handles meta data of a spectrum. 
+object which handles meta data of a mass spectrum. In the following example program, a :py:class:`~.MSSpectrum` is filled with peaks, sorted according to mass-to-charge ratio and a selection of peak positions is displayed. -First we create a spectrum and insert peaks with descending mass-to-charge ratios: +First we create a mass spectrum and insert peaks with descending mass-to-charge ratios: .. code-block:: python :linenos: @@ -86,15 +86,15 @@ production code. To discover the full set of functionality of :py:class:`~.MSSpectrum`, we use the Python :py:func:`~.help` function. In particular, we find several important sets of meta -information attached to the spectrum including retention time, the ms level -(MS1, MS2, ...), precursor ion, ion mobility drift time and extra data arrays. +information attached to the mass spectrum including retention time, the MS level +(:term:`MS1`, :term:`MS2`, ...), precursor ion, ion mobility drift time and extra data arrays. .. code-block:: python :linenos: help(MSSpectrum) -We now set several of these properties in a current MSSpectrum: +We now set several of these properties in a current :py:class:`~.MSSpectrum`: .. code-block:: python :linenos: @@ -158,34 +158,35 @@ We now set several of these properties in a current MSSpectrum: scan polarity: positive -We have created a single spectrum and set basic spectrum properties (drift +We have created a single mass spectrum and set basic mass spectrum properties (drift time, retention time, MS level, precursor charge, isolation window and activation energy). Additional instrument settings allow to set e.g. the polarity of the Ion source). -We next add actual peaks into the spectrum (a single peak at 401.5 *m/z* and 900 intensity). -Additional metadata can be stored in data arrays for each peak +We next add actual peaks into the spectrum (a single peak at :math:`401.5` m/z and :math:`900\ intensity`). +Additional metadata can be stored in data arrays for each peak (e.g.
use cases care peak annotations or "Signal to Noise" values for each peak. Finally, we add the spectrum to an :py:class:`~.MSExperiment` container to save it using the -:py:class:`~.MzMLFile` class in a file called "testfile.mzML". +:py:class:`~.MzMLFile` class in a file called ``testfile.mzML``. -You can now open the resulting spectrum in a spectrum viewer. We use the OpenMS -viewer ``TOPPView`` (which you will get when you install OpenMS from the -official website) and look at our spectrum: +You can now open the resulting mass spectrum in a mass spectrum viewer. We use the OpenMS +viewer :term:`TOPPView` (which you will get when you install OpenMS from the +official website) and look at our mass spectrum: .. image:: img/spectrum1.png -TOPPView displays our spectrum with its single peak at 401.5 *m/z* and it -also correctly displays its retention time at 205.2 seconds and precursor -isolation target of 600.0 *m/z*. Notice how TOPPView displays the information -about the S/N for the peak (S/N = 15) and its annotation as ``y15++`` in the status -bar below when the user clicks on the peak at 401.5 *m/z* as shown in the +:term:`TOPPView` displays our mass spectrum with its single peak at :math:`401.5\ m/z` and it +also correctly displays its retention time at :math:`205.2\ seconds` and precursor +isolation target of :math:`600.0\ m/z`. Notice how :term:`TOPPView` displays the information +about the S/N for the peak (S/N = 15) and its annotation as :chem:`y15++` in the status +bar below when the user clicks on the peak at :math:`401.5\ m/z` as shown in the screenshot. -We can also visualize our spectrum from before using the :py:func:`~.plot_spectrum` function from the +We can also visualize our mass spectrum from before using the :py:func:`~.plot_spectrum` function from the `spectrum_utils `_ visualization library: ..
code-block:: python :linenos: + import matplotlib.pyplot as plt from pyopenms.plotting import plot_spectrum import matplotlib.pyplot as plt @@ -197,7 +198,7 @@ We can also visualize our spectrum from before using the :py:func:`~.plot_spectr Chromatogram -************ +********************************** An additional container for raw data is the :py:class:`~.MSChromatogram` container, which is highly analogous to the :py:class:`~.MSSpectrum` container, but contains an array of @@ -301,9 +302,9 @@ is highly analogous to the :py:class:`~.MSSpectrum` container, but contains an a Access an individual peak by index 800.0 0.4111122786998749 -This shows how the :py:class:`~.MSExperiment` class can hold spectra as well as chromatograms. +This shows how the :py:class:`~.MSExperiment` class can hold mass spectra as well as chromatograms. -Again we can visualize the resulting data using ``TOPPView`` using its chromatographic viewer +Again we can visualize the resulting data using :term:`TOPPView` using its chromatographic viewer capability, which shows the peak over retention time: .. image:: img/chromatogram1.png @@ -324,11 +325,11 @@ the :py:class:`~.MSExperiment` class), which we have already encountered above. :py:class:`~.MSExperiment` class can hold a list of :py:class:`~.MSSpectrum` object (as well as a list of :py:class:`~.MSChromatogram` objects, see below). The :py:class:`~.MSExperiment` object holds such peak maps as well as meta-data about the injection. Access to -individual spectra is performed through :py:meth:`~.MSExperiment.getSpectrum` and +individual mass spectra is performed through :py:meth:`~.MSExperiment.getSpectrum` and :py:meth:`~.MSExperiment.getChromatogram`. In the following code, we create an :py:class:`~.MSExperiment` and populate it with -several spectra: +several mass spectra: ..
code-block:: python :linenos: @@ -389,10 +390,10 @@ several spectra: In the above code, we create six instances of :py:class:`~.MSSpectrum` (line 4), populate -it with three peaks at 500, 900 and 100 *m/z* and append them to the -:py:class:`~.MSExperiment` object (line 13). We can easily iterate over the spectra in +it with three peaks at :math:`500`, :math:`900` and :math:`100` m/z and append them to the +:py:class:`~.MSExperiment` object (line 13). We can easily iterate over the mass spectra in the whole experiment by using the intuitive iteration on lines 16-19 or we can -use list comprehensions to sum up intensities of all spectra that fulfill +use list comprehensions to sum up intensities of all mass spectra that fulfill certain conditions: .. code-block:: python @@ -415,7 +416,7 @@ certain conditions: 700.0 -We could store the resulting experiment containing the six spectra as mzML +We could store the resulting experiment containing the six mass spectra as mzML using the :py:class:`~.MzMLFile` object: .. code-block:: python @@ -424,7 +425,7 @@ using the :py:class:`~.MzMLFile` object: # Store as mzML MzMLFile().store("testfile2.mzML", exp) -Again we can visualize the resulting data using ``TOPPView`` using its 3D +Again we can visualize the resulting data using :term:`TOPPView` using its 3D viewer capability, which shows the six scans over retention time where the traces first increase and then decrease in intensity: @@ -486,7 +487,7 @@ provided by OpenMS. .. image:: img/Spectra2DDetails.png For larger data sets this will be too slow since every individual peak gets displayed. -However, we can use :py:class:`~.BilinearInterpolation` which produces an overview image of our spectra. +However, we can use :py:class:`~.BilinearInterpolation` which produces an overview image of our mass spectra. This can be useful for a brief visual inspection of your sample in quality control. .. 
code-block:: python @@ -541,7 +542,7 @@ This can be useful for a brief visual inspection of your sample in quality contr Example: Precursor Purity ************************** -When an MS2 spectrum is generated, the precursor from the MS1 spectrum is gathered, fragmented and measured. +When an :term:`MS2` spectrum is generated, the precursor from the :term:`MS1` spectrum is gathered, fragmented and measured. In practice, the instrument gathers the ions in a user-defined window around the precursor m/z - the so-called precursor isolation window. @@ -620,11 +621,11 @@ We could assess that we have four other non-isotopic peaks apart from our precur The signal of the isotopic peaks correspond to roughly 78% of all intensities in the precursor isolation window. -Example: Filtering Spectra -************************** +Example: Filtering Mass Spectra +******************************* Here we will look at some code snippets that might come in handy -when dealing with spectra data. +when dealing with mass spectra data. But first, we will load some test data: @@ -638,11 +639,12 @@ But first, we will load some test data: MzMLFile().load("test.mzML", inp) -Filtering Spectra by MS level -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Filtering Mass Spectra by MS Level +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -We will filter the data from "test.mzML" file by only retaining -only spectra that are not MS1 spectra (e.g.\ MS2, MS3 or MSn spectra): +We will filter the data from ``test.mzML`` file by only retaining +mass spectra that are not :term:`MS1` spectra +(e.g. :term:`MS2`, :term:`MS3` or MSn spectra): ..
code-block:: python :linenos: @@ -655,10 +657,10 @@ only spectra that are not MS1 spectra (e.g.\ MS2, MS3 or MSn spectra): # filtered now only contains spectra with MS level > 2 -Filtering by scan number +Filtering by Scan Number ~~~~~~~~~~~~~~~~~~~~~~~~ -We could also use a list of scan numbers as filter criterium +We could also use a list of scan numbers as filter criterion to only retain a list of MS scans we are interested in: .. code-block:: python @@ -672,10 +674,10 @@ to only retain a list of MS scans we are interested in: filtered.addSpectrum(s) -Filtering Spectra and Peaks -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Filtering Mass Spectra and Peaks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Suppose we are interested in only in a small m/z window of our fragment ion spectra. +Suppose we are interested in only in a small m/z window of our fragment ion mass spectra. We can easily filter our data accordingly: .. code-block:: python @@ -699,10 +701,10 @@ We can easily filter our data accordingly: Note that in a real-world application, we would set the ``mz_start`` and ``mz_end`` parameter to an actual area of interest, for example the area -between 125 and 132 which contains quantitative ions for a TMT experiment. +between 125 and 132 which contains quantitative ions for a :term:`TMT` experiment. Similarly we could only retain peaks above a certain -intensity or keep only the top N peaks in each spectrum. +intensity or keep only the top N peaks in each mass spectrum. For more advanced filtering tasks pyOpenMS provides special algorithm classes. We will take a closer look at some of them in the algorithm section. diff --git a/docs/source/mzMLFileFormat.rst b/docs/source/mzml_files.rst similarity index 91% rename from docs/source/mzMLFileFormat.rst rename to docs/source/mzml_files.rst index 8ca05055f..500ee5788 100644 --- a/docs/source/mzMLFileFormat.rst +++ b/docs/source/mzml_files.rst @@ -1,22 +1,22 @@ -mzML files -========== +:term:`mzML` Files +================== .. 
NOTE:: - This is an advanced section that dives deep into the mzML format and we + This is an advanced section that dives deep into the :term:`mzML` format and we will investigate the file format in greater detail. The intricacies of the - mzML file format are all handled by pyOpenMS internally + :term:`mzML` file format are all handled by pyOpenMS internally and this section is only intended for the interested reader -Specifically, we will look at mzML stores raw spectral data and how this data -is encoded in the XML format. The mzML standard is developed by the HUPO-PSI +Specifically, we will look at how :term:`mzML` stores raw spectral data and how this data +is encoded in the XML format. The :term:`mzML` standard is developed by the HUPO-PSI committee and can be read on the `official mzML website `_. It describes how to store the meta data and the raw data for spectra and chromatograms. In short, the standard uses XML to encode all meta data and stores the raw data using `Base64 encoding `_. -Binary encoding +Binary Encoding --------------- :index:`To proceed `, we will download an example file: @@ -80,10 +80,10 @@ We now see that the data encoded describes 10 m/z data points that are equally spaced in intervals of two, starting from 0 m/z and ending at 18 m/z (note: this is a synthetic dataset). -Base64 encoding +Base64 Encoding --------------- -From the mzML standard, we know that the array is :index:`base64 ` encoded and we can now try to decode this data ourselves. We will first use pure Python functions : @@ -169,12 +169,12 @@ This allows us thus to manually decode the data. We can use pyOpenMS to encode a b'eJxjYAADBwaGBiA+AMQMjgwMCkDsAMQJQNwAxBMcAVbKBVc=' Note how encoding the data with 64 bit precision results in an output string of -length 108 characters that is about twice as long compared to encoding the data -with 32 bit precision which is of length 56 characters.
However, this +length :math:`108` characters that is about twice as long compared to encoding the data +with 32 bit precision which is of length :math:`56` characters. However, this difference disappears when zlib compression is used and the resulting string is shorter still. -numpress encoding +Numpress Encoding ----------------- We can do even better, using the :index:`numpress ` compression. The numpress algorithm diff --git a/docs/source/nasequences.rst b/docs/source/oligonucleotides_rna.rst similarity index 94% rename from docs/source/nasequences.rst rename to docs/source/oligonucleotides_rna.rst index 6637b81ac..694d1f4de 100644 --- a/docs/source/nasequences.rst +++ b/docs/source/oligonucleotides_rna.rst @@ -77,7 +77,7 @@ The :py:class:`~.NASequence` object also allows iterations directly in Python: for ribo in oligo: print(ribo.getName()) -Fragment ions +Fragment Ions ~~~~~~~~~~~~~ Similarly to before for amino acid sequences, we can also generate internal fragment ions: @@ -100,13 +100,13 @@ Similarly to before for amino acid sequences, we can also generate internal frag print("RNA Oligo w4++ ion", suffix, "has mz", mz) print("RNA Oligo w4++ ion", suffix, "has molecular formula", w4_formula) -Modified oligonucleotides +Modified Oligonucleotides ************************* -Modified nucleotides can also represented by the :py:class:`~.Ribonucleotide` class and +Modified nucleotides can also be represented by the :py:class:`~.Ribonucleotide` class and are specified using a unique string identifier present in the -:py:class:`~.RibonucleotideDB` in square brackets. For example, ``[m1A]`` represents -1-methyladenosine. We can create a :py:class:`~.NASequence` object by parsing a modified +:py:class:`~.RibonucleotideDB` in square brackets. For example, :chem:`[m1A]` represents +1-methyl-adenosine. We can create a :py:class:`~.NASequence` object by parsing a modified sequence as follows: .. 
code-block:: python diff --git a/docs/source/other_file_handling.rst b/docs/source/other_ms_data_formats.rst similarity index 93% rename from docs/source/other_file_handling.rst rename to docs/source/other_ms_data_formats.rst index 3ff293194..04aac31f9 100644 --- a/docs/source/other_file_handling.rst +++ b/docs/source/other_ms_data_formats.rst @@ -1,12 +1,13 @@ -Other MS data formats -======================= +Other MS Data Formats +============================= -Identification data (idXML, mzIdentML, pepXML, protXML) +Identification Data (idXML, mzIdentML, pepXML, protXML) ------------------------------------------------------- You can store and load identification data from an `idXML` file as follows: .. code-block:: python + :linenos: from urllib.request import urlretrieve from pyopenms import * @@ -21,6 +22,7 @@ You can store and load identification data from an `idXML` file as follows: You can store and load identification data from an `mzIdentML` file as follows: .. code-block:: python + :linenos: from urllib.request import urlretrieve @@ -37,6 +39,7 @@ You can store and load identification data from an `mzIdentML` file as follows: You can store and load identification data from a TPP `pepXML` file as follows: .. code-block:: python + :linenos: from urllib.request import urlretrieve @@ -51,6 +54,7 @@ You can store and load identification data from a TPP `pepXML` file as follows: You can load (storing is not supported) identification data from a TPP `protXML` file as follows: .. code-block:: python + :linenos: from urllib.request import urlretrieve @@ -63,12 +67,12 @@ You can load (storing is not supported) identification data from a TPP `protXML` .. 
ProtXMLFile().store("test.out.protXML", protein_ids, peptide_ids, "doc_id_42") -note how each data file produces two vectors of type :py:class:`~.ProteinIdentification` +Note how each data file produces two vectors of type :py:class:`~.ProteinIdentification` and :py:class:`~.PeptideIdentification` which also means that conversion between two data types is trivial: load data from one data file and use the storage function of the other file. -Quantiative data (featureXML, consensusXML) +Quantitative Data (featureXML, consensusXML) ------------------------------------------------------- OpenMS stores quantitative information in the internal ``featureXML`` and @@ -121,7 +125,7 @@ Transition data (TraML) ------------------------------------------------------- The TraML data format allows you to store transition information for targeted -experiments (SRM / MRM / PRM / DIA). +experiments (:term:`SRM` / MRM / PRM / DIA). .. code-block:: python :linenos: diff --git a/docs/source/parameter_handling.rst b/docs/source/parameter_handling.rst index 13a60fc21..e4312e8c0 100644 --- a/docs/source/parameter_handling.rst +++ b/docs/source/parameter_handling.rst @@ -50,6 +50,7 @@ The parameters can then be accessed as The param object can be copy and merge in to other param object as .. code-block:: python + :linenos: # print the key and values pairs stored in a Param object def printParamKeyAndValues(p): @@ -79,6 +80,7 @@ The param object can be copy and merge in to other param object as In param object the keys values can be remove by key_name or prefix as ..
code-block:: python + :linenos: # We now call the remove method with key of the entry we want to delete ("example3") new_p.remove("example3") diff --git a/docs/source/peptide_search.rst b/docs/source/peptide_search.rst index b52b384d9..4e3443070 100644 --- a/docs/source/peptide_search.rst +++ b/docs/source/peptide_search.rst @@ -1,10 +1,10 @@ Peptide Search ============== -In MS-based proteomics, fragment ion spectra (MS2 spectra) are often -interpreted by comparing them against a theoretical set of spectra generated -from a FASTA database. OpenMS contains a (simple) implementation of such a -"search engine" that compares experimental spectra against theoretical spectra +In MS-based proteomics, fragment ion mass spectra (:term:`MS2`) are often +interpreted by comparing them against a theoretical set of mass spectra generated +from a :term:`FASTA` database. OpenMS contains a (simple) implementation of such a +"search engine" that compares experimental against theoretical mass spectra generated from an enzymatic or chemical digest of a proteome (e.g. tryptic digest). @@ -18,7 +18,7 @@ In most proteomics applications, a dedicated search engine (such as Comet, Crux, Mascot, MSGFPlus, MSFragger, Myrimatch, OMSSA, SpectraST or XTandem; all of which are supported by pyOpenMS) will be used to search data. Here, we will use the internal :py:class:`~.SimpleSearchEngineAlgorithm` from OpenMS used for teaching -purposes. This makes it very easy to search an (experimental) mzML file against +purposes. This makes it very easy to search an (experimental) :term:`mzML` file against a fasta database of protein sequences: .. code-block:: python @@ -37,7 +37,7 @@ a fasta database of protein sequences: ) This will print search engine output including the number of peptides and -proteins in the database and how many spectra were matched to peptides and +proteins in the database and how many mass spectra were matched to peptides and proteins: .. 
code-block:: console @@ -51,11 +51,11 @@ proteins: match to both : 0 (0 %) -PSM inspection -************** +:term:`Peptide-Spectrum Match` (:term:`PSM`) Inspection +******************************************************************************* We can now investigate the individual hits as we have done before in the -`Identification tutorial `_. +`identification tutorial `_. .. code-block:: python :linenos: @@ -86,8 +86,8 @@ We can now investigate the individual hits as we have done before in the print(" - Peptide hit score:", hit.getScore()) -We notice that the second peptide spectrum match (PSM) was found for the third -spectrum in the file for a precursor at 775.38 m/z for the sequence +We notice that the second :term:`PSM` was found for the third +:term:`mass spectrum` in the file for a precursor at :math:`775.38` m/z for the sequence ``RPGADSDIGGFGGLFDLAQAGFR``. .. code-block:: python @@ -112,26 +112,26 @@ spectrum in the file for a precursor at 775.38 m/z for the sequence for peak in peaks: print(peak[0], "mz", peak[1], "int") -Comparing the theoretical spectrum and the experimental spectrum for -``RPGADSDIGGFGGLFDLAQAGFR`` we can easily see that the most abundant ions in the -spectrum are y8 (877.452 m/z), b10 (926.432), y9 (1024.522 m/z) and b13 -(1187.544 m/z). +Comparing the theoretical and the experimental mass spectrum for +``RPGADSDIGGFGGLFDLAQAGFR`` we can easily see that the most abundant ions in the mass spectrum are +:chem:`y8` (:math:`877.452` m/z), :chem:`b10` (:math:`926.432`), :chem:`y9` +(:math:`1024.522` m/z) and :chem:`b13` (:math:`1187.544` m/z).
Visualization ************* When loading the ``searchfile.mzML`` into the OpenMS -visualization software TOPPView, we can convince ourselves that the observed -spectrum indeed was generated by the peptide ``RPGADSDIGGFGGLFDLAQAGFR`` by loading -the corresponding theoretical spectrum into the viewer using "Tools"->"Generate +visualization software :term:`TOPPView`, we can convince ourselves that the observed +mass spectrum indeed was generated by the peptide ``RPGADSDIGGFGGLFDLAQAGFR`` by loading +the corresponding theoretical mass spectrum into the viewer using "Tools"->"Generate theoretical spectrum": .. image:: img/psm.png -From our output above, we notice that the second peptide spectrum match (PSM) -at 775.38 m/z for sequence ``RPGADSDIGGFGGLFDLAQAGFR`` was found with an error -tolerance of 2.25 ppm, therefore if we set the precursor mass tolerance to 4 -ppm (+/- 2ppm), we expect that we will not find the hit at 775.38 m/z any more: +From our output above, we notice that the second :term:`PSM` +at :math:`775.38` m/z for sequence ``RPGADSDIGGFGGLFDLAQAGFR`` was found with an error +tolerance of :math:`2.25\ ppm`, therefore if we set the precursor mass tolerance to :math:`4\ +ppm\ (\pm 2\ ppm)`, we expect that we will not find the hit at :math:`775.38` m/z any more: .. code-block:: python :linenos: diff --git a/docs/source/aasequences.rst b/docs/source/peptides_proteins.rst similarity index 88% rename from docs/source/aasequences.rst rename to docs/source/peptides_proteins.rst index c594f2fc0..720f0b90f 100644 --- a/docs/source/aasequences.rst +++ b/docs/source/peptides_proteins.rst @@ -46,13 +46,13 @@ The example below shows how amino acid sequences can be created and how basic ma Which prints the amino acid sequence as well as the result of concatenating two sequences or taking the suffix of a sequence. 
-We then compute the mass of the full peptide (``[M]``), the mass of the -peptide precursor (``[M+2H]2+``) and ``m/z`` value of the -peptide precursor (``[M+2H]2+``). +We then compute the mass of the full peptide (:chem:`[M]`), the mass of the +peptide precursor (:chem:`[M+2H]2+`) and m/z value of the +peptide precursor (:chem:`[M+2H]2+`). Note that, the mass of the peptide precursor is shifted by two protons that are now attached to the -molecules as charge carriers. (Detail: the proton mass of 1.007276 u is -slightly different from the mass of an uncharged hydrogen atom at 1.007825 u). -We can easily calculate the charged weight of a ``(M+2H)2+`` ion and compute *m/z* simply dividing by the charge. +molecules as charge carriers. (Detail: the proton mass of :math:`1.007276\ u` is +slightly different from the mass of an uncharged hydrogen atom at :math:`1.007825\ u`). +We can easily calculate the charged weight of a :chem:`(M+2H)2+` ion and compute m/z simply dividing by the charge. .. code-block:: output @@ -93,7 +93,7 @@ Which will print Arginine : 174.1116764466 The N- and C-Terminus as well as the residues themself can be modified. -The example below shows how to check fo such modifications. +The example below shows how to check for such modifications. .. code-block:: python :linenos: @@ -128,7 +128,7 @@ Which will print: Arginine : 174.1116764466 -Molecular formula +Molecular Formula ~~~~~~~~~~~~~~~~~ We can now combine our knowledge of :py:class:`~.AASequence` with what we learned in @@ -143,7 +143,7 @@ the amino acid sequence. But first, let's get the formula of peptide: print("Peptide", seq, "has molecular formula", seq_formula) -Isotope patterns +Isotope Patterns ~~~~~~~~~~~~~~~~ We now want to print the coarse (e.g., peaks only at nominal masses) distribution. @@ -162,7 +162,7 @@ We now want to print the coarse (e.g., peaks only at nominal masses) distributio For most applications in computational proteomics, the coarse isotope distribution is sufficient. 
But if we deal with very high resolution instruments, we still might want to calculate the isotopic fine structure. -We use the FineIsotopePatternGenerator in OpenMS to reveal these addtional peaks: +We use the :py:class:`~.FineIsotopePatternGenerator` in OpenMS to reveal these additional peaks: .. code-block:: python :linenos: @@ -177,7 +177,7 @@ We use the FineIsotopePatternGenerator in OpenMS to reveal these addtional peaks ) -And plot the very similar looking distributions using standard matplotlib functionality: +And plot the very similar looking distributions using standard ``matplotlib`` functionality: .. code-block:: python :linenos: @@ -217,7 +217,7 @@ And plot the very similar looking distributions using standard matplotlib functi .. image:: img/DFPIANGER_isoDistribution.png -Fragment ions +Fragment Ions ~~~~~~~~~~~~~ We can easily calculate different ion types for amino acid sequences: @@ -298,7 +298,7 @@ The above code outputs: Note there is a subtle difference between ``AASequence.fromString(".DFPIAM[+16]GER.")`` and ``AASequence.fromString(".DFPIAM[+15.9949]GER.")`` - while the former will try to -find the first modification matching to a mass difference of 16 +/- 0.5, the +find the first modification matching to a mass difference of :math:`16 \pm 0.5`, the latter will try to find the closest matching modification to the exact mass. The exact mass approach usually gives the intended results while the first approach may or may not. In all instances, it is better to use an exact description of the desired modification, such as UniMod, instead of mass differences. @@ -319,20 +319,20 @@ phosphorylation of the last arginine at its side chain: s = AASequence.fromString(".DFPIAMGER(Phospho).") print(s, s.hasCTerminalModification()) -Arbitrary/unknown amino acids (usually due to an unknown modification) can be -specified using tags preceded by X: "X[weight]". This indicates a new amino -acid ("X") with the specified weight, e.g. ``"RX[148.5]T"``. 
Note that this tag -does not alter the amino acids to the left (R) or right (T). Rather, X -represents an amino acid on its own. Be careful when converting such AASequence +Arbitrary / unknown amino acids (usually due to an unknown modification) can be +specified using tags preceded by :chem:`X`: :chem:`X[weight]`. This indicates a new amino +acid (":chem:`X`") with the specified weight, e.g. :chem:`RX[148.5]T`. Note that this tag +does not alter the amino acids to the left (:chem:`R`) or right (:chem:`T`). Rather, :chem:`X` +represents an amino acid on its own. Be careful when converting such :py:class:`~.AASequence` objects to an EmpiricalFormula using :py:meth:`~.AASequence.getFormula`, as tags will not be considered in this case (there exists no formula for them). However, they have an influence on :py:meth:`~.AASequence.getMonoWeight` and :py:meth:`~.AASequence.getAverageWeight`! -Proteins and FASTA files -************************ +Proteins and :term:`FASTA` Files +******************************** -Protein sequences, can be loaded from and stored in FASTA protein databases using :py:class:`~.FASTAFile`. -The example below shows how protein sequences can be stored in FASTA files and loaded back in pyOpenMS: +Protein sequences can be loaded from and stored in :term:`FASTA` protein databases using :py:class:`~.FASTAFile`. +The example below shows how protein sequences can be stored in :term:`FASTA` files and loaded back in pyOpenMS: .. code-block:: python :linenos: diff --git a/docs/source/pyopenms_in_r.rst b/docs/source/pyopenms_in_r.rst index e0c8ed50c..b7c511f7c 100644 --- a/docs/source/pyopenms_in_r.rst +++ b/docs/source/pyopenms_in_r.rst @@ -1,14 +1,14 @@ pyOpenMS in R -=============== +===================== Currently, there are no native wrappers for the OpenMS library in R, however we can use the "reticulate" package in order to get access to the full functionality of pyOpenMS in the R programming language.
-Install the "reticulate" R package +Install the "reticulate" R Package ********************************** -In order to use all pyopenms functionalities in R, we suggest to use the "reticulate" R package. +In order to use all pyOpenMS functionalities in R, we suggest to use the "reticulate" R package. A thorough documentation is available at: https://rstudio.github.io/reticulate/ @@ -17,7 +17,7 @@ A thorough documentation is available at: https://rstudio.github.io/reticulate/ install.packages("reticulate") -Installation of pyopenms is a requirement as well and it is necessary to make sure that R is using the same Python environment. +Installation of pyOpenMS is a requirement as well and it is necessary to make sure that R is using the same Python environment. In case R is having trouble to find the correct Python environment, you can set it by hand as in this example (using miniconda, you will have to adjust the file path to your system to make this work). You will need to do this before loading the "reticulate" library: @@ -34,10 +34,10 @@ Or after loading the "reticulate" library: library("reticulate") use_python("/usr/local/miniconda3/envs/py37/bin/python") -Import pyopenms in R -******************** +Import pyOpenMS in R +**************************** -After loading the "reticulate" library you should be able to import pyopenms into R +After loading the "reticulate" library you should be able to import pyOpenMS into R .. code-block:: R :linenos: @@ -45,10 +45,10 @@ After loading the "reticulate" library you should be able to import pyopenms int library(reticulate) ropenms=import("pyopenms", convert = FALSE) -This should now give you access to all of pyopenms in R. Importantly, the convert option +This should now give you access to all of pyOpenMS in R. Importantly, the convert option has to be set to FALSE, since type conversions such as 64bit integers will cause a problem. 
-You should now be able to interact with the OpenMS library and, for example, read and write mzML files: +You should now be able to interact with the OpenMS library and, for example, read and write :term:`mzML` files: .. code-block:: R :linenos: @@ -58,13 +58,13 @@ You should now be able to interact with the OpenMS library and, for example, rea exp = ropenms$MSExperiment() ropenms$MzMLFile()$store("testfile.mzML", exp) -which will create an empty mzML file called `testfile.mzML`. +which will create an empty :term:`mzML` file called `testfile.mzML`. Getting help ************ -Using the "reticulate" R package provides a way to access the pyopenms information -about the available functions and methods. We can inspect individual pyOpenMS objects +Using the "reticulate" R package provides a way to access the pyOpenMS information +about the available functions and methods. We can inspect individual pyOpenMS objects through the ``py_help`` function: .. code-block:: R @@ -129,8 +129,8 @@ Therefore in this case we need to use the ``reticulate::r_to_py()`` and ``reticu An example use case ******************* -Reading an mzML File -^^^^^^^^^^^^^^^^^^^^ +Reading an :term:`mzML` File +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ pyOpenMS supports a variety of different files through the implementations in OpenMS. In order to read mass spectrometric data, we can download the `mzML @@ -243,7 +243,7 @@ Or visualize a particular ms2 spectrum: .. image:: img/R_ggplot_ms2.png -Alternatively, we could also have used ``apply`` to obtain the peak data, which +Alternatively, we could also have used ``apply`` to obtain the peak data, which is more idiomatic way of doing things for the R programming language: .. code-block:: R @@ -259,7 +259,7 @@ is more idiomatic way of doing things for the R programming language: Iteration ^^^^^^^^^ -Iterating over pyopenms objects is not equal to iterating over R vectors or +Iterating over pyOpenMS objects is not equal to iterating over R vectors or lists. 
Note that for many applications, there is a more efficient way to access data (such as :py:meth:`~.MSSpectrum.get_peaks` instead of iterating over individual peaks). diff --git a/docs/source/mzqc_export.rst b/docs/source/quality_control.rst similarity index 99% rename from docs/source/mzqc_export.rst rename to docs/source/quality_control.rst index 654b31ce2..4253e9d67 100644 --- a/docs/source/mzqc_export.rst +++ b/docs/source/quality_control.rst @@ -12,6 +12,7 @@ at least a :py:class:`~.MSExperiment`. Optionally, a :py:class:`~.FeatureMap` an proteomics and metabolomics quality metrics. .. code-block:: python + :linenos: from pyopenms import * diff --git a/docs/source/datastructures_quant.rst b/docs/source/quantitative_data.rst similarity index 85% rename from docs/source/datastructures_quant.rst rename to docs/source/quantitative_data.rst index 6ae9ff0f5..16134fef5 100644 --- a/docs/source/datastructures_quant.rst +++ b/docs/source/quantitative_data.rst @@ -1,13 +1,13 @@ Quantitative Data ================= -Feature -******* +Features +************************** In OpenMS, information about quantitative data is stored in a so-called :py:class:`~.Feature` which we have previously discussed `here `_. Each -:py:class:`~.Feature` represents a region in RT and *m/z* space use for quantitative +:py:class:`~.Feature` represents a region in RT and m/z space used for quantitative analysis. .. code-block:: python @@ -28,7 +28,7 @@ analysis. masstrace.push_back(p) Usually, the quantitative features would be produced by a so-called -"FeatureFinder" algorithm, which we will discuss in the next chapter. The +:py:class:`~.FeatureFinder` algorithm, which we will discuss in the next chapter. The features can be stored in a :py:class:`~.FeatureMap` and written to disk. .. code-block:: python @@ -43,18 +43,18 @@ features can be stored in a :py:class:`~.FeatureMap` and written to disk.
fm.push_back(feature) FeatureXMLFile().store("test.featureXML", fm) -Visualizing the resulting map in ``TOPPView`` allows detection of the two +Visualizing the resulting map in :term:`TOPPView` allows detection of the two features stored in the :py:class:`~.FeatureMap` with the visualization indicating charge -state, *m/z*, RT and other properties: +state, m/z, RT and other properties: .. image:: img/feature.png -Note that in this case only 2 features are present, but in a typical :term:`LC-MS/MS` +Note that in this case only two features are present, but in a typical :term:`LC-MS/MS` experiments, thousands of features are present. -FeatureMap -************ +:term:`Feature Maps` +********************************** The resulting :py:class:`~.FeatureMap` can be used in various ways to extract quantitative data directly and it supports direct iteration in Python: @@ -69,8 +69,8 @@ quantitative data directly and it supports direct iteration in Python: -ConsensusFeature -**************** +:term:`Consensus Features` +********************************************** Often :term:`LC-MS/MS` experiments are run to compare quantitative features across experiments. In OpenMS, linked features from individual experiments are @@ -103,7 +103,7 @@ represented by a :py:class:`~.ConsensusFeature` We have thus added two features from two individual maps (which have the unique identifier ``1`` and ``2``) to the :py:class:`~.ConsensusFeature`. -Next, we inspect the consensus feature, compute a "consensus" *m/z* across +Next, we inspect the :term:`consensus feature`, compute a "consensus" m/z across the two maps and output the two linked features: .. code-block:: python @@ -136,12 +136,12 @@ unique identifiers. Visualization of the resulting output file reveals a single :py:class:`~.ConsensusFeature` of size 2 that links to the two individual features at -their respective positions in RT and *m/z*: +their respective positions in RT and m/z: .. 
image:: img/consensus.png -ConsensusMap -************ +:term:`Consensus Maps` +************************************** The resulting :py:class:`~.ConsensusMap` can be used in various ways to extract quantitative data directly and it supports direct iteration in Python: diff --git a/docs/source/massql.rst b/docs/source/query_msexperiment_massql.rst similarity index 80% rename from docs/source/massql.rst rename to docs/source/query_msexperiment_massql.rst index 61022eb31..3c2c32116 100644 --- a/docs/source/massql.rst +++ b/docs/source/query_msexperiment_massql.rst @@ -1,15 +1,15 @@ -Query MSExperiment with MassQL -============================== +Query :py:class:`~.MSExperiment` with MassQL +============================================ -MassQL is a powerful, SQL-like query language for :term:`Mass spectrometry` data. +MassQL is a powerful, SQL-like query language for mass spectrometry data. For further information visit the `MassQL documentation `_. -MS data from a :py:class:`~.MSExperiment` can be exported to MS1 and MS2 dataframes, which can +MS data from a :py:class:`~.MSExperiment` can be exported to :term:`MS1` and :term:`MS2` dataframes, which can be queried directly with the ``massql`` module. **pyopenms.MSExperiment.get_massql_df()** - Exports data from MSExperiment to pandas DataFrames to be used with MassQL. + Exports data from :py:class:`~.MSExperiment` to pandas DataFrames to be used with MassQL. Both dataframes contain the columns: 'i': intensity of a peak @@ -20,26 +20,27 @@ be queried directly with the ``massql`` module. 
'rt': retention time of the spectrum 'polarity': ion mode of the spectrum as integer value (positive: 1, negative: 2) - The MS2 dataframe contains additional columns: + The :term:`MS2` dataframe contains additional columns: 'precmz': mass to charge of the precursor ion - 'ms1scan': number of the corresponding MS1 spectrum + 'ms1scan': number of the corresponding :term:`MS1` spectrum 'charge': charge of the precursor ion **Returns:** ms1_df : **pandas.DataFrame** - peak data of MS1 spectra + peak data of :term:`MS1` spectra ms2_df : **pandas.DataFrame** - peak data of MS2 spectra with precursor information + peak data of :term:`MS2` spectra with precursor information **Example:** -Load an example file into a :py:class:`~.MSExperiment` and get the MS1 and MS2 data frames for a MassQL query. +Load an example file into a :py:class:`~.MSExperiment` and get the :term:`MS1` and :term:`MS2` data frames for a MassQL query. .. code-block:: python + :linenos: from pyopenms import * from massql import msql_engine @@ -58,6 +59,8 @@ Load an example file into a :py:class:`~.MSExperiment` and get the MS1 and MS2 d ms1_df, ms2_df = exp.get_massql_df() ms1_df.head() + + .. csv-table:: ms1_df.head() :widths: 2 20 20 20 20 20 20 20 :header: , i, i_norm, i_tic_norm, mz, scan, rt, polarity @@ -84,6 +87,7 @@ will read data from the given file name. results_df.head() + .. 
csv-table:: results_df.head() :widths: 2 20 20 20 20 20 :header: , scan, rt, mslevel, i, i_norm diff --git a/docs/source/file_handling.rst b/docs/source/reading_raw_ms_data.rst similarity index 89% rename from docs/source/file_handling.rst rename to docs/source/reading_raw_ms_data.rst index 53f3d2b0e..3ed59923a 100644 --- a/docs/source/file_handling.rst +++ b/docs/source/reading_raw_ms_data.rst @@ -1,11 +1,11 @@ -Reading Raw MS data -=================== +Reading Raw MS Data +=========================== -mzML files in memory -******************** +:term:`mzML` Files in Memory +**************************** -As discussed in the last section, the most straight forward way to load mass -spectrometric data is using the :py:class:`~.MzMLFile` class: +As discussed in the last section, the most straight forward way to load :term:`mass +spectrometry` data is using the :py:class:`~.MzMLFile` class: .. code-block:: python @@ -37,7 +37,7 @@ manipulate the spectra in the file for example as follows: exp.setSpectra(spec) -Which will only keep MS2 spectra in the :py:class:`~.MSExperiment`. We can then store the modified data structure on disk: +Which will only keep :term:`MS2` spectra in the :py:class:`~.MSExperiment`. We can then store the modified data structure on disk: .. code-block:: python @@ -62,8 +62,8 @@ Putting this together, a small filtering program would look like this: MzMLFile().store("filtered.mzML", exp) -indexed mzML files -****************** +Indexed :term:`mzML` Files +************************** Since pyOpenMS 2.4, you can open, read and inspect files that use the indexedMzML standard. This allows users to read MS data without loading all @@ -87,16 +87,16 @@ data into memory: Note that the :py:class:`~.OnDiscMSExperiment` allows users to access meta data through the :py:meth:`~.OnDiscMSExperiment.getMetaData` function, which allows easy selection and filtering on meta -data attributes (such as MS level, precursor *m/z*, retention time etc.) 
in -order to select spectra and chromatograms for analysis. Only once selection on +data attributes (such as MS level, precursor m/z, retention time etc.) in +order to select spectra and chromatograms for analysis. Only once selection on the meta data has been performed, will actual data be loaded into memory using the :py:meth:`~.OnDiscMSExperiment.getChromatogram` and :py:meth:`~.OnDiscMSExperiment.getSpectrum` functions. This approach is memory efficient in cases where computation should only occur on part of the data or the whole data may not fit into memory. -mzML files as streams -********************* +:term:`mzML` Files as Streams +***************************** In some instances it is impossible or inconvenient to load all data from an mzML file directly into memory. OpenMS offers streaming-based access to mass @@ -187,10 +187,10 @@ Note that this approach is memory efficient in cases where computation should only occur on part of the data or the whole data may not fit into memory. -cached mzML files -********************* +Cached :term:`mzML` Files +************************* -In addition, since pyOpenMS 2.4 the user can efficiently cache mzML files to disk which +In addition, since pyOpenMS 2.4 the user can efficiently cache :term:`mzML` files to disk which provides very fast access with minimal overhead in memory. Basically the data directly mapped into memory when requested. You can use this feature as follows: @@ -219,8 +219,8 @@ directly mapped into memory when requested. You can use this feature as follows: Note that the :py:class:`~.CachedmzML` allows users to access meta data through the :py:meth:`~.CachedmzML.getMetaData` function, which allows easy selection and filtering on meta -data attributes (such as MS level, precursor *m/z*, retention time etc.) in -order to select spectra and chromatograms for analysis. Only once selection on +data attributes (such as MS level, precursor m/z, retention time etc.) 
in +order to select spectra and chromatograms for analysis. Only once selection on the meta data has been performed, will actual data be loaded into memory using the :py:meth:`~.CachedmzML.getChromatogram` and :py:meth:`~.CachedmzML.getSpectrum` functions. diff --git a/docs/source/hyperscore.rst b/docs/source/scoring_spectra_hyperscore.rst similarity index 81% rename from docs/source/hyperscore.rst rename to docs/source/scoring_spectra_hyperscore.rst index a98be23ea..a7ee692ff 100644 --- a/docs/source/hyperscore.rst +++ b/docs/source/scoring_spectra_hyperscore.rst @@ -1,10 +1,10 @@ -Scoring spectra with HyperScore +Scoring Spectra with HyperScore =============================== In the chapter on spectrum alignment we showed how to determine matching peaks between theoretical and experimental spectra. For many use cases we might actually not be interested in obtaining the list of matched peaks but would like to have a simple, single score that indicates how "well" the two spectra matched. -The :py:class:`~.HyperScore` is a method to assign a spectrum match score to spectrum matches. +The :py:class:`~.HyperScore` is a method to assign a score to :term:`peptide-spectrum matches`. Background @@ -13,10 +13,10 @@ Background :py:class:`~.HyperScore` computes the (ln transformed) ``HyperScore`` of theoretical spectrum, calculated from a peptide/oligonucleotide sequence, with an experimental spectrum, -loaded from an mzML file. +loaded from an :term:`mzML` file. 1. the dot product of peak intensities between matching peaks in experimental and theoretical spectrum is calculated - 2. the ``HyperScore`` is calculated from the dot product by multiplying by factorials of matching b- and y-ions + 2. the :py:class:`~.HyperScore` is calculated from the dot product by multiplying by factorials of matching b- and y-ions .. code-block:: python @@ -26,7 +26,7 @@ loaded from an mzML file. 
gh = "https://raw.githubusercontent.com/OpenMS/pyopenms-docs/master" urlretrieve(gh + "/src/data/SimpleSearchEngine_1.mzML", "searchfile.mzML") -Generate a theoretical spectrum +Generate a Theoretical Spectrum ******************************* @@ -56,16 +56,16 @@ We now use the :py:class:`~.TheoreticalSpectrumGenerator` to generate a theoreti for peak in peaks: print(peak[0], "mz", peak[1], "int") -Comparing the theoretical spectrum and the experimental spectrum for +Comparing the theoretical spectrum and the experimental spectrum for ``RPGADSDIGGFGGLFDLAQAGFR`` we can easily see that the most abundant ions in the -spectrum are y8 (877.452 m/z), b10 (926.432), y9 (1024.522 m/z) and b13 -(1187.544 m/z). +spectrum are :chem:`y8` (:math:`877.452` m/z), :chem:`b10` (:math:`926.432`), :chem:`y9` +(:math:`1024.522` m/z) and :chem:`b13` (:math:`1187.544` m/z). -Getting a score +Getting a Score *************** We now run :py:class:`~.HyperScore` to compute the similarity of the theoretical spectrum -and the experimental spectrum and print the result +and the experimental spectrum and print the result .. code-block:: python diff --git a/docs/source/smoothing.rst b/docs/source/smoothing.rst index cc174a8b3..ef5c5cdc8 100644 --- a/docs/source/smoothing.rst +++ b/docs/source/smoothing.rst @@ -26,9 +26,9 @@ further analysis gf.filterExperiment(exp) MzMLFile().store("tutorial.smoothed.mzML", exp) -We can now load our data into TOPPView to observe the effect of the smoothing, +We can now load our data into :term:`TOPPView` to observe the effect of the smoothing, which becomes apparent when we overlay the two files (drag onto each other) and -then zoom into a given mass range using Ctrl-G and select 4030 to 4045: +then zoom into a given mass range using Ctrl-G and select :math:`4030` to :math:`4045`: ..
image:: img/smoothing.png diff --git a/docs/source/spectrumalignment.rst b/docs/source/spectrum_alignment.rst similarity index 86% rename from docs/source/spectrumalignment.rst rename to docs/source/spectrum_alignment.rst index 85b0920c0..7ba932476 100644 --- a/docs/source/spectrumalignment.rst +++ b/docs/source/spectrum_alignment.rst @@ -1,9 +1,9 @@ -Spectrum alignment +Spectrum Alignment ================== -OpenMS provides several ways to find matching peaks between two spectra. -The most basic one SpectrumAlignment returns a list of matching peak indices between a query and target spectrum. -In this example, we take an observed (measured) spectrum and align a theoretical spectrum to it. +OpenMS provides several ways to find matching peaks between two mass spectra. +The most basic one :py:class:`~.SpectrumAlignment` returns a list of matching peak indices between a query and target mass spectrum. +In this example, we take an observed (measured) mass spectrum and align a theoretical mass spectrum to it. First we load a (chemically modified) peptide: @@ -27,7 +27,7 @@ First we load a (chemically modified) peptide: observed_spectrum = spectra[0] -Now we generate the theoretical spectrum of that peptide: +Now we generate the theoretical mass spectrum of that peptide: .. code-block:: python :linenos: @@ -42,11 +42,12 @@ Now we generate the theoretical spectrum of that peptide: peptide = AASequence.fromString("YIC(Carbamidomethyl)DNQDTISSK") tsg.getSpectrum(theo_spectrum, peptide, 1, 2) -Now we can plot the observed and theoretical spectrum as a mirror plot: +Now we can plot the observed and theoretical mass spectrum as a mirror plot: .. code-block:: python :linenos: + import matplotlib.pyplot as plt from pyopenms.plotting import mirror_plot_spectrum import matplotlib.pyplot as plt @@ -61,7 +62,7 @@ which produces .. image:: img/spec_alignment_1.png -Now we want to find matching peaks between observed and theoretical spectrum. 
+Now we want to find matching peaks between observed and theoretical mass spectrum. .. code-block:: python :linenos: @@ -117,11 +118,12 @@ The alignment contains a list of matched peak indices. We can simply inspect mat y10++ 2 584.251 584.412 y11++ 2 640.793 640.954 -The mirror plot can also be used to visualize the aligned spectrum: +The mirror plot can also be used to visualize the aligned mass spectrum: .. code-block:: python :linenos: + import matplotlib.pyplot as plt from pyopenms.plotting import mirror_plot_spectrum import matplotlib.pyplot as plt diff --git a/docs/source/normalization.rst b/docs/source/spectrum_normalization.rst similarity index 68% rename from docs/source/normalization.rst rename to docs/source/spectrum_normalization.rst index c3be0da0d..8783abfdb 100644 --- a/docs/source/normalization.rst +++ b/docs/source/spectrum_normalization.rst @@ -1,11 +1,12 @@ -Spectrum normalization +Spectrum Normalization ====================== -Another very basic spectrum processing step is normalization by base peak intensity (the maximum intensity of a spectrum). +Another very basic mass spectrum processing step is normalization by base peak intensity (the maximum intensity of a mass spectrum). Let's first load the raw data. .. code-block:: python + :linenos: from urllib.request import urlretrieve from pyopenms import * @@ -28,6 +29,7 @@ Let's first load the raw data. Now we apply the normalization. .. code-block:: python + :linenos: normalizer = Normalizer() param = normalizer.getParameters() @@ -42,6 +44,6 @@ Now we apply the normalization. ) -Another way of normalizing is by TIC (total ion count) of the spectrum, which scales intensities -so they add up to 1.0 in each spectrum. -Try it out for yourself by setting: param.setValue("method", "to_TIC"). +Another way of normalizing is by TIC (total ion count) of the mass spectrum, which scales intensities +so they add up to :math:`1.0` in each mass spectrum. 
+Try it out for yourself by setting: ``param.setValue("method", "to_TIC")``. diff --git a/docs/source/support.rst b/docs/source/support.rst index 57bfe28dc..40e2b7206 100644 --- a/docs/source/support.rst +++ b/docs/source/support.rst @@ -1,7 +1,7 @@ Support ======= -Feature requests +Feature Requests **************** pyOpenMS is an evolving project. We are happy to learn about missing features you would like to @@ -13,7 +13,7 @@ Feel free to open an issue on `GitHub issue `_ or the `OpenMS Gitter chat channel `_. diff --git a/docs/source/metabolomics_preprocessing.rst b/docs/source/untargeted_metabolomics_preprocessing.rst similarity index 90% rename from docs/source/metabolomics_preprocessing.rst rename to docs/source/untargeted_metabolomics_preprocessing.rst index dff0aa7a7..641f882f0 100644 --- a/docs/source/metabolomics_preprocessing.rst +++ b/docs/source/untargeted_metabolomics_preprocessing.rst @@ -1,13 +1,13 @@ Untargeted Metabolomics Pre-Processing ====================================== -The universal workflow for untargeted metabolomics always consists of feature detection in the individual MS sample -files and their linkage to consensus features with common m/z and retention time values. -In addition, there are optional steps such as adduct detection and annotation of features with associated MS2 spectra. +The universal workflow for untargeted :term:`metabolomics` always consists of :term:`feature` detection in the individual MS sample +files and their linkage to :term:`consensus features` with common m/z and retention time values. +In addition, there are optional steps such as adduct detection and annotation of :term:`features` with associated :term:`MS2` spectra. .. image:: img/metabolomics-preprocessing.png -First, download two example ``mzML`` files. +First, download two example :term:`mzML` files. .. code-block:: python @@ -17,7 +17,7 @@ First, download two example ``mzML`` files.
urlretrieve(gh + "/src/data/Metabolomics_1.mzML", "Metabolomics_1.mzML") urlretrieve(gh + "/src/data/Metabolomics_2.mzML", "Metabolomics_2.mzML") -For each ``mzML`` file do mass trace, elution peak and feature detection. +For each :term:`mzML` file do mass trace, elution peak and feature detection. .. code-block:: python @@ -75,7 +75,7 @@ For each ``mzML`` file do mass trace, elution peak and feature detection. ) # Sets the file path to the primary MS run (usually the mzML file) feature_maps.append(feature_map) -Align feature retention times based on the feature map with the highest number of features (reference map). +Align feature retention times based on the :term:`feature map` with the highest number of features (reference map). .. code-block:: python :linenos: @@ -105,7 +105,7 @@ Align feature retention times based on the feature map with the highest number o transformer = MapAlignmentTransformer() transformer.transformRetentionTimes(feature_map, trafo, True) -Align ``mzML`` files aligment based on :py:class:`~.FeatureMap` alignment (optional, only for GNPS). +Align :term:`mzML` files based on :py:class:`~.FeatureMap` alignment (optional, only for GNPS). .. code-block:: python :linenos: @@ -125,7 +125,7 @@ Align ``mzML`` files aligment based on :py:class:`~.FeatureMap` alignment (optio MzMLFile().store(file[:-5] + "_aligned.mzML", exp) mzML_files = [file[:-5] + "_aligned.mzML" for file in mzML_files] -Map MS2 spectra to features as :py:class:`~.PeptideIdentification` objects (optional, only for GNPS). +Map :term:`MS2` spectra to features as :py:class:`~.PeptideIdentification` objects (optional, only for GNPS). .. code-block:: python :linenos: @@ -226,7 +226,7 @@ Link features in a :py:class:`~.ConsensusMap`. consensus_map.setUniqueIds() ConsensusXMLFile().store("FeatureMatrix.consensusXML", consensus_map) -To get a final feature matrix in a table format, export the consensus features in a ``pandas DataFrame``.
+To get a final feature matrix in a table format, export the :term:`consensus features` in a ``pandas DataFrame``. .. code-block:: python :linenos: diff --git a/docs/source/wrap_classes.rst b/docs/source/wrapping_workflows_new_classes.rst similarity index 96% rename from docs/source/wrap_classes.rst rename to docs/source/wrapping_workflows_new_classes.rst index 310a88c76..d09761e3d 100644 --- a/docs/source/wrap_classes.rst +++ b/docs/source/wrapping_workflows_new_classes.rst @@ -1,8 +1,8 @@ -Wrapping Workflow and wrapping new Classes -****************************************** +Wrapping Workflows and New Classes +********************************** -How pyOpenMS wraps Python classes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +How pyOpenMS Wraps Python Classes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ General concept of how the wrapping is done (all files are in ``src/pyOpenMS/``): @@ -25,18 +25,18 @@ Maintaining existing wrappers: If the C++ API is changed, then pyOpenMS will not build any more. Thus, find the corresponding file in the ``pyOpenMS/pxds/`` folder and adjust the function declaration accordingly. -How to wrap new methods in existing classes +How to Wrap New Methods in Existing Classes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Lets say you have written a new method for an existing OpenMS class and you would like to expose this method to pyOpenMS. First, identify the correct ``.pxd`` file in the ``src/pyOpenMS/pxds`` folder (for example for -``Adduct`` that would be `Adduct.pxd +:py:class:`~.Adduct` that would be `Adduct.pxd `_). Open it and add your new function *with the correct indentation*: - Place the full function declaration into the file (indented as the other functions) -- Check whether you are using any classes that are not yet imported, if so add a corresponding ``cimport`` statement to the top of the file. E.g. if your method is using using ``MSExperiment``, then add ``from MSExerpiment cimport *`` to the top (note its cimport, not import).
+- Check whether you are using any classes that are not yet imported; if so, add a corresponding ``cimport`` statement to the top of the file. E.g. if your method is using :py:class:`~.MSExperiment`, then add ``from MSExperiment cimport *`` to the top (note it's cimport, not import). - Remove any qualifiers (e.g. `const`) from the function signature and add `nogil except +` to the end of the signature - Ex: ``void setType(Int a);`` becomes ``void setType(Int a) nogil except +`` @@ -61,12 +61,12 @@ Open it and add your new function *with the correct indentation*: See the next section for a SimpleExample_ and a more AdvancedExample_ of a wrapped class with several functions. -How to wrap new classes +How to Wrap New Classes ^^^^^^^^^^^^^^^^^^^^^^^ .. _SimpleExample: -A simple example +A Simple Example ---------------- To wrap a new OpenMS class: Create a new ".pxd" file in the folder ``./pxds``. As @@ -194,7 +194,7 @@ implementations ``AbstractBaseClassImpl1`` and ``AbstractBaseClassImpl2``. Then, the function needs to declared and overloaded with both implementations as arguments as shown above. -An example with handwritten addon code +An Example with Handwritten Addon Code -------------------------------------- A more complex examples requires some hand-written wrapper code @@ -273,7 +273,7 @@ to generate the required iterators and process the container efficiently. .. _Limitations Section: -Considerations and limitations +Considerations and Limitations ------------------------------ Further considerations and limitations: @@ -311,7 +311,7 @@ These hints can be given to autowrap functions (also check the autowrap document - ``wrap-upper-limit:size()`` (see MSSpectrum.pxd) -Wrapping code yourself in ./addons +Wrapping Code Yourself in ./addons ---------------------------------- Not all code can be wrapped automatically (yet). Place a file with the same (!)