Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds target/decoy information. More...

#include <OpenMS/ANALYSIS/ID/PeptideIndexing.h>

Inheritance diagram for PeptideIndexing:

[legend]

Collaboration diagram for PeptideIndexing:

[legend]

Public Types
enum	ExitCodes { EXECUTION_OK , DATABASE_EMPTY , PEPTIDE_IDS_EMPTY , ILLEGAL_PARAMETERS , UNEXPECTED_RESULT }
	Exit codes. More...

enum class	Unmatched { IS_ERROR , WARN , REMOVE , SIZE_OF_UNMATCHED }
	Action to take when peptide hits could not be matched. More...

enum class	MissingDecoy { IS_ERROR , WARN , SILENT , SIZE_OF_MISSING_DECOY }

Public Types inherited from ProgressLogger
enum	LogType { CMD , GUI , NONE }
	Possible log types. More...

Public Member Functions
	PeptideIndexing ()
	Default constructor. More...

	~PeptideIndexing () override
	Default destructor. More...

ExitCodes	run (std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
	forward for old interface and pyOpenMS; use other run() methods for more control More...

ExitCodes	run (FASTAContainer< TFI_File > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
	Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/decoy hits. More...

ExitCodes	run (FASTAContainer< TFI_Vector > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
	Same as run() with TFI_File, but for proteins which are already in memory. More...

const String &	getDecoyString () const
	Which string is used to determine if a protein is a decoy or not. More...

bool	isPrefix () const
	Is the decoy string position a prefix or suffix? More...

Public Member Functions inherited from DefaultParamHandler
	DefaultParamHandler (const String &name)
	Constructor with name that is displayed in error messages. More...

	DefaultParamHandler (const DefaultParamHandler &rhs)
	Copy constructor. More...

virtual	~DefaultParamHandler ()
	Destructor. More...

DefaultParamHandler &	operator= (const DefaultParamHandler &rhs)
	Assignment operator. More...

virtual bool	operator== (const DefaultParamHandler &rhs) const
	Equality operator. More...

void	setParameters (const Param &param)
	Sets the parameters. More...

const Param &	getParameters () const
	Non-mutable access to the parameters. More...

const Param &	getDefaults () const
	Non-mutable access to the default parameters. More...

const String &	getName () const
	Non-mutable access to the name. More...

void	setName (const String &name)
	Mutable access to the name. More...

const std::vector< String > &	getSubsections () const
	Non-mutable access to the registered subsections. More...

Public Member Functions inherited from ProgressLogger
	ProgressLogger ()
	Constructor. More...

virtual	~ProgressLogger ()
	Destructor. More...

	ProgressLogger (const ProgressLogger &other)
	Copy constructor. More...

ProgressLogger &	operator= (const ProgressLogger &other)
	Assignment Operator. More...

void	setLogType (LogType type) const
	Sets the progress log that should be used. The default type is NONE! More...

LogType	getLogType () const
	Returns the type of progress log being used. More...

void	setLogger (ProgressLoggerImpl *logger)
	Sets the logger to be used for progress logging. More...

void	startProgress (SignedSize begin, SignedSize end, const String &label) const
	Initializes the progress display. More...

void	setProgress (SignedSize value) const
	Sets the current progress. More...

void	endProgress (UInt64 bytes_processed=0) const

void	nextProgress () const
	increment progress by 1 (according to range begin-end) More...

Static Public Attributes
static char const *const	AUTO_MODE
	name of enzyme/specificity which signals that the enzyme/specificity should be taken from meta information More...

static const std::array< std::string,(Size) Unmatched::SIZE_OF_UNMATCHED >	names_of_unmatched

static const std::array< std::string,(Size) MissingDecoy::SIZE_OF_MISSING_DECOY >	names_of_missing_decoy

Protected Member Functions
void	updateMembers_ () override
	This method is used to update extra member variables at the end of the setParameters() method. More...

template<typename T >
ExitCodes	run_ (FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)

Protected Member Functions inherited from DefaultParamHandler
void	defaultsToParam_ ()
	Updates the parameters after the defaults have been set in the constructor. More...

Protected Attributes
String	decoy_string_ {}

bool	prefix_ { false }

MissingDecoy	missing_decoy_action_ = MissingDecoy::IS_ERROR

String	enzyme_name_ {}

String	enzyme_specificity_ {}

bool	write_protein_sequence_ { false }

bool	write_protein_description_ { false }

bool	keep_unreferenced_proteins_ { false }

Unmatched	unmatched_action_ = Unmatched::IS_ERROR

bool	IL_equivalent_ { false }

bool	allow_nterm_protein_cleavage_ { true }

Int	aaa_max_ {0}

Int	mm_max_ {0}

Protected Attributes inherited from DefaultParamHandler
Param	param_
	Container for current parameters. More...

Param	defaults_
	Container for default parameters. This member should be filled in the constructor of derived classes! More...

std::vector< String >	subsections_
	Container for registered subsections. This member should be filled in the constructor of derived classes! More...

String	error_name_
	Name that is displayed in error messages during the parameter checking. More...

bool	check_defaults_
	If this member is set to false, no checking of parameters is done. More...

bool	warn_empty_defaults_
	If this member is set to false, no warning is emitted when defaults are empty. More...

Protected Attributes inherited from ProgressLogger
LogType	type_

time_t	last_invoke_

ProgressLoggerImpl *	current_logger_

Additional Inherited Members
Static Public Member Functions inherited from DefaultParamHandler
static void	writeParametersToMetaValues (const Param &write_this, MetaInfoInterface &write_here, const String &key_prefix="")
	Writes all parameters to meta values. More...

Static Protected Attributes inherited from ProgressLogger
static int	recursion_depth_

Detailed Description

Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds target/decoy information.

All peptide and protein hits are annotated with target/decoy information, using the meta value "target_decoy". For proteins the possible values are "target" and "decoy", depending on whether the protein accession contains the decoy pattern (parameter decoy_string) at the expected position, i.e. as a prefix or as a suffix (see parameter prefix). For peptides, the possible values are "target", "decoy" and "target+decoy", depending on whether the peptide sequence is found only in target proteins, only in decoy proteins, or in both. The target/decoy information is crucial for the FalseDiscoveryRate tool. (For FDR calculations, "target+decoy" peptide hits count as target hits.)

Note: Make sure that your protein names in the database contain a correctly formatted decoy string. This can be ensured by using DecoyDatabase. If the decoy identifier is not recognized successfully, all proteins will be assumed to stem from the target-part of the query.
E.g., "sw|P33354_DECOY|YEHR_ECOLI Uncharacterized lipop..." is invalid, since the tool has no knowledge of how SwissProt entries are built up. A correct identifier could be "DECOY_sw|P33354|YEHR_ECOLI Uncharacterized li ..." or "sw|P33354|YEHR_ECOLI_DECOY Uncharacterized li", depending on whether you are using prefix or suffix annotation.

Some helpful target/decoy statistics will be reported when done.

By default this tool will fail if an unmatched peptide occurs, i.e. if the database does not contain the corresponding protein. You can force it to return successfully in this case by setting '-unmatched_action' to accept or even remove those hits.

Search engines (such as X!Tandem) will replace ambiguous amino acids ('B', 'J', 'Z' and 'X') in the protein database with unambiguous amino acids in the reported peptides, e.g. exchange 'X' with 'H'. This will cause such peptides to not be found by exactly matching their sequences to the protein database. However, we can recover these cases by using tolerant search for ambiguous amino acids in the protein sequence. This is done by default with up to three amino acids per peptide hit. If you only want exact matches, set aaa_max to zero (but expect that unmatched peptides might occur)!

Leucine/Isoleucine: Further complications can arise due to the presence of the isobaric amino acids isoleucine ('I') and leucine ('L') in protein sequences. Since the two have the exact same chemical composition and mass, they generally cannot be distinguished by mass spectrometry. If a peptide containing 'I' was reported as a match for a spectrum, a peptide containing 'L' instead would be an equally good match (and vice versa). To account for this inherent ambiguity, setting the flag IL_equivalent causes 'I' and 'L' to be considered as indistinguishable.
For example, if the sequence "PEPTIDE" (matching "Protein1") was identified as a search hit, but the database additionally contained "PEPTLDE" (matching "Protein2"), running PeptideIndexer with the IL_equivalent option would report both "Protein1" and "Protein2" as accessions for "PEPTIDE". (This is independent of ambiguous matching via aaa_max.) Additionally, setting this flag will convert all 'J's in any protein sequence to 'I'. This way, no tolerant search is required for 'J' (but is still possible for all the other ambiguous amino acids). If write_protein_sequences is requested and IL_equivalent is set as well, both the I/L-version and unmodified protein sequences need to be stored internally. This requires some extra memory, roughly equivalent to the size of the FASTA database file itself.

Enzyme specificity: Once a peptide sequence is found in a protein sequence, this does not imply that the hit is valid! This is where enzyme specificity comes into play. By default, the enzyme and the specificity used during search is derived from metadata in the idXML files ('auto' setting).

We make two exceptions to any specificity constraints: 1) peptides starting at the second or third position of a protein are still considered N-terminally specific, since the preceding residues can be cleaved off in vivo; X!Tandem reports these peptides. For example, the two peptides ABAR and LABAR would both match a protein starting with MLABAR. 2) adventitious cleavage at Asp|Pro (Aspartate/D | Proline/P) is allowed for all enzymes (as supported by X!Tandem), i.e. counts as a proper cleavage site (see http://www.thegpm.org/tandem/release.html).

You can relax the requirements further by choosing semi-tryptic (only one of two "internal" termini must match requirements) or none (essentially allowing all hits, no matter their context). These settings should not be used (due to high risk of reporting false positives), unless the search engine was instructed to search peptides in the same way (but then the default 'auto' setting will do the correct thing).

X!Tandem treats any occurrence of 'X' as stop codon (and thus as cleavage site). The resulting peptide will be non- or semi-tryptic. Those hits will not be matched and need to be removed using '-unmatched_action' (do not use termini specificity to cheat around it! It adds more false hits!).

The FASTA file should not contain duplicate protein accessions (since accessions are not validated) if a correct unique-matching annotation is important (target/decoy annotation is still correct).

Threading: This tool supports multiple threads (threads option) to speed up computation, at the cost of a little extra memory.

Member Enumeration Documentation

◆ ExitCodes

enum ExitCodes

Exit codes.

Enumerator
EXECUTION_OK
DATABASE_EMPTY
PEPTIDE_IDS_EMPTY
ILLEGAL_PARAMETERS
UNEXPECTED_RESULT

◆ MissingDecoy

enum MissingDecoy

strong

Enumerator
IS_ERROR
WARN
SILENT
SIZE_OF_MISSING_DECOY

◆ Unmatched

enum Unmatched

strong

Action to take when peptide hits could not be matched.

Enumerator
IS_ERROR	throws an error (and returns no results)
WARN	skips annotation with target/decoy but returns with 'success'
REMOVE	removes unmatched hits entirely and returns with 'success'
SIZE_OF_UNMATCHED

Constructor & Destructor Documentation

◆ PeptideIndexing()

PeptideIndexing ( )

Default constructor.

◆ ~PeptideIndexing()

~PeptideIndexing ( )

override

Default destructor.

Member Function Documentation

◆ getDecoyString()

const String& getDecoyString ( ) const

Which string is used to determine if a protein is a decoy or not.

◆ isPrefix()

bool isPrefix ( ) const

Is the decoy string position a prefix or suffix?

◆ run() [1/3]

ExitCodes run	(	FASTAContainer< TFI_File > &	proteins,
		std::vector< ProteinIdentification > &	prot_ids,
		std::vector< PeptideIdentification > &	pep_ids
	)

Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/decoy hits.

Template parameter 'T' can be either TFI_File or TFI_Vector. If the data is already available, use TFI_Vector and pass the vector. If the data is still in a FASTA file and it is not needed afterwards for additional processing, use TFI_File and pass the filename.

PeptideIndexer refreshes target/decoy information and mapping of peptides to proteins. The target/decoy information is crucial for the FalseDiscoveryRate tool. (For FDR calculations, "target+decoy" peptide hits count as target hits.)

PeptideIndexer allows for ambiguous amino acids (B|J|Z|X) in the protein database, but not in the peptide sequences. For the latter only I/L can be treated as equivalent (see 'IL_equivalent' flag), but 'J' is not allowed.

Enzyme cutting rules and partial specificity can be specified.

Resulting protein hits appear in the order of the FASTA file, except for orphaned proteins, which will appear first with an empty target_decoy metavalue. Duplicate protein accessions & sequences will not raise a warning, but create multiple hits (PeptideIndexer scans over the FASTA file once for efficiency reasons, and thus might not see all accessions & sequences at once).

All peptide and protein hits are annotated with target/decoy information, using the meta value "target_decoy". For proteins the possible values are "target" and "decoy", depending on whether the protein accession contains the decoy pattern (parameter decoy_string) at the expected position, i.e. as a prefix or as a suffix (see parameter prefix).

Peptide hits are annotated with metavalue 'protein_references', and if matched to at least one protein also with metavalue 'target_decoy'. The possible values for 'target_decoy' are "target", "decoy" and "target+decoy", depending on whether the peptide sequence is found only in target proteins, only in decoy proteins, or in both. The metavalue is not present, if the peptide is unmatched.

Runtime: PeptideIndexer is usually very fast (loading and storing the data takes the most time) and search speed can be further improved (linearly) by using more threads. Avoid allowing too many (>=4) ambiguous amino acids if your database contains long stretches of 'X' (exponential search space).

Parameters

proteins	A list of proteins – either read piecewise from a FASTA file or as existing vector of FASTAEntries.
prot_ids	Resulting protein identifications associated to pep_ids (will be re-written completely)
pep_ids	Peptide identifications which should be searched within `proteins` and then linked to `prot_ids`

Returns: Exit status codes.

◆ run() [2/3]

ExitCodes run	(	FASTAContainer< TFI_Vector > &	proteins,
		std::vector< ProteinIdentification > &	prot_ids,
		std::vector< PeptideIdentification > &	pep_ids
	)

Same as run() with TFI_File, but for proteins which are already in memory.

◆ run() [3/3]

ExitCodes run	(	std::vector< FASTAFile::FASTAEntry > &	proteins,
		std::vector< ProteinIdentification > &	prot_ids,
		std::vector< PeptideIdentification > &	pep_ids
	)

forward for old interface and pyOpenMS; use other run() methods for more control

◆ run_()

ExitCodes run_	(	FASTAContainer< T > &	proteins,
		std::vector< ProteinIdentification > &	prot_ids,
		std::vector< PeptideIdentification > &	pep_ids
	)

protected

◆ updateMembers_()

void updateMembers_ ( )

override protected virtual

This method is used to update extra member variables at the end of the setParameters() method.

Also call it at the end of the derived classes' copy constructor and assignment operator.

The default implementation is empty.

Reimplemented from DefaultParamHandler.

Member Data Documentation

◆ aaa_max_

Int aaa_max_ {0}

protected

◆ allow_nterm_protein_cleavage_

bool allow_nterm_protein_cleavage_ { true }

protected

◆ AUTO_MODE

char const* const AUTO_MODE

static

name of enzyme/specificity which signals that the enzyme/specificity should be taken from meta information

◆ decoy_string_

String decoy_string_ {}

protected

◆ enzyme_name_

String enzyme_name_ {}

protected

◆ enzyme_specificity_

String enzyme_specificity_ {}

protected

◆ IL_equivalent_

bool IL_equivalent_ { false }

protected

◆ keep_unreferenced_proteins_

bool keep_unreferenced_proteins_ { false }

protected

◆ missing_decoy_action_

MissingDecoy missing_decoy_action_ = MissingDecoy::IS_ERROR

protected

◆ mm_max_

Int mm_max_ {0}

protected

◆ names_of_missing_decoy

const std::array<std::string, (Size)MissingDecoy::SIZE_OF_MISSING_DECOY> names_of_missing_decoy

static

◆ names_of_unmatched

const std::array<std::string, (Size)Unmatched::SIZE_OF_UNMATCHED> names_of_unmatched

static

◆ prefix_

bool prefix_ { false }

protected

◆ unmatched_action_

Unmatched unmatched_action_ = Unmatched::IS_ERROR

protected

◆ write_protein_description_

bool write_protein_description_ { false }

protected

◆ write_protein_sequence_

bool write_protein_sequence_ { false }

protected

Public Types

Public Member Functions

Static Public Attributes

Protected Member Functions

Protected Attributes

Additional Inherited Members

Detailed Description

Member Enumeration Documentation

◆ ExitCodes

◆ MissingDecoy

◆ Unmatched

Constructor & Destructor Documentation

◆ PeptideIndexing()

◆ ~PeptideIndexing()

Member Function Documentation

◆ getDecoyString()

◆ isPrefix()

◆ run() [1/3]

◆ run() [2/3]

◆ run() [3/3]

◆ run_()

◆ updateMembers_()

Member Data Documentation

◆ aaa_max_

◆ allow_nterm_protein_cleavage_

◆ AUTO_MODE

◆ decoy_string_

◆ enzyme_name_

◆ enzyme_specificity_

◆ IL_equivalent_

◆ keep_unreferenced_proteins_

◆ missing_decoy_action_

◆ mm_max_

◆ names_of_missing_decoy

◆ names_of_unmatched

◆ prefix_

◆ unmatched_action_

◆ write_protein_description_

◆ write_protein_sequence_