% PhD thesis (Dublin City University, 2010) on manifold learning applied to speech features.
% Abstract typo fixed: "lowdimensional" now reads "low-dimensional".
@phdthesis{Errity2010,
  author    = {A. Errity},
  title     = {Exploring the dimensionality of speech using manifold learning and
dimensionality reduction methods},
  school    = {Dublin City University},
  year      = {2010},
  abstract  = {Many previous investigations have indicated that speech data has inherent
low-dimensional structure and that it may be possible to efficiently
represent speech using only a small number of parameters. This view
is motivated by the fact that articulatory movement is limited by
physiological constraints and thus the speech production apparatus
has only limited degrees of freedom. Also, the set of sounds used
in human spoken communication is only a small subset of all producible
sounds. A number of dimensionality reduction methods capable of discovering
such underlying structure have previously been applied to speech.
However, if speech lies on a manifold nonlinearly embedded in high-dimensional
space, as has been proposed in the past, classic linear dimensionality
reduction methods would be unable to discover this embedding. In
this dissertation a number of manifold learning, also referred to
as nonlinear dimensionality reduction, methods are applied to speech
to explore the possibility of underlying nonlinear manifold structure.
This dissertation describes a number of existing manifold learning
methods and details the application of these methods to high-dimensional
feature representations of speech data. Representations derived from
the conventional magnitude spectrum and less widely used phase spectrum
are investigated. The manifold learning methods used in this study
are locally linear embedding, Isomap, and Laplacian eigenmaps. The
classic linear method, principal component analysis (PCA), is also
applied to facilitate the comparison of linear and nonlinear methods.
The resulting low-dimensional representations are analysed through
visualisation, phone recognition, and speaker recognition experiments.
The recognition experiments are used as a means of evaluating how
much meaningful discriminatory information is contained in the low-dimensional
representations produced by each method. These experiments also serve
to display the potential value of these methods in speech processing
applications.
The manifold learning methods are shown to be capable of producing
meaningful low-dimensional representations of speech data suggesting
speech has low-dimensional manifold structure. In general, these
methods are found to outperform PCA in low dimensions, indicating
that speech may lie on a manifold nonlinearly embedded in high-dimensional
space. Phone classification experiments show that Isomap can offer
improvements over standard features and PCA-transformed features.
Investigation of magnitude and phase spectrum representations found
both to have similar low-dimensional structure and confirm that the
phase spectrum contains useful information for phone discrimination.
Results indicate that combining magnitude and phase spectrum information
yields improvements in phone classification tasks. A method to combine
magnitude and phase spectrum features for increased phone classification
accuracy without large increases in feature dimensionality is also
described.},
  owner     = {ame},
  timestamp = {2010.04.06},
  url       = {http://doras.dcu.ie/15142/}
}
% Interspeech 2009 paper comparing linear and nonlinear dimensionality reduction on synthetic speech.
% The INTERSPEECH2009 macro has no visible definition in this file, so it is declared
% here to stop the booktitle expanding to empty text. The wording is a placeholder modelled
% on the other Interspeech entries; remove this declaration if a separate strings file
% already supplies the macro.
% Month is given as the standard macro (sep) rather than a literal string.
@string{INTERSPEECH2009 = {Proc. of Interspeech 2009}}

@inproceedings{Errity2009,
  author    = {A. Errity and J. McKenna},
  title     = {A Comparison of Linear and Nonlinear Dimensionality Reduction Methods
Applied to Synthetic Speech},
  booktitle = INTERSPEECH2009,
  year      = {2009},
  pages     = {1095--1098},
  address   = {Brighton, UK},
  month     = sep,
  abstract  = {In this study a number of linear and nonlinear dimensionality reduction
methods are applied to high dimensional representations of synthetic
speech to produce corresponding low dimensional embeddings. Several
important characteristics of the synthetic speech, such as formant
frequencies and f0, are known and controllable prior to dimensionality
reduction. The degree to which these characteristics are retained
after dimensionality reduction is examined in visualisation and classification
experiments. Results of these experiments indicate that each method
is capable of discovering meaningful low dimensional representations
of synthetic speech and that the nonlinear methods may outperform
linear methods in some cases.},
  owner     = {ame},
  timestamp = {2009.06.20}
}
% DSP 2007 paper: PCA versus Isomap features for GMM-based speaker identification.
% Month is given as the standard macro (jul) rather than a literal string.
@inproceedings{Errity2007a,
  author    = {A. Errity and J. McKenna},
  title     = {A comparative study of linear and nonlinear dimensionality reduction
for speaker identification},
  booktitle = {Proc. of the 15th Int. Conf. on Digital Signal Processing (DSP)},
  year      = {2007},
  pages     = {587--590},
  address   = {Cardiff, Wales},
  month     = jul,
  abstract  = {In this paper we apply linear and nonlinear dimensionality reduction
methods to speech produced by a number of different speakers in an
effort to yield low dimensional features capable of discriminating
between speakers. The classical linear dimensionality reduction method,
principal component analysis ({PCA}), and the nonlinear manifold
learning method, {I}somap, are investigated. The resulting features
are evaluated in {GMM}-based speaker identification experiments and
compared to conventional cepstral features. {I}somap is shown to
give the highest accuracy for very low dimensions, outperforming
{MFCC}s and {PCA} transformed features. Isomap is shown to be useful
for visualisation of speaker clusters. For higher dimensions, speaker
identification results indicate that features resulting from {PCA}
offer improvements over conventional {MFCC}s.},
  owner     = {aerrity},
  timestamp = {2007.04.24},
  url       = {http://ieeexplore.ieee.org/iel5/4288490/4288491/04288650.pdf}
}
% Interspeech 2006 (ICSLP) paper applying manifold learning algorithms to vowel separation.
% Month is given as the standard macro (sep); the address gains the missing comma after the city.
@inproceedings{Errity2006,
  author    = {A. Errity and J. McKenna},
  title     = {An Investigation of Manifold Learning for Speech Analysis},
  booktitle = {Proc. of the Int. Conf. on Spoken Language Processing (Interspeech
2006 - ICSLP)},
  year      = {2006},
  pages     = {2506--2509},
  address   = {Pittsburgh, PA, USA},
  month     = sep,
  abstract  = {Due to the physiological constraints of articulatory motion the speech
apparatus has limited degrees of freedom. As a result, the range
of speech sounds a human is capable of producing may lie on a low
dimensional submanifold of the high dimensional space of all possible
sounds. In this study a number of manifold learning algorithms are
applied to speech data in an effort to extract useful low dimensional
structure from the high dimensional speech signal. The ability of
these manifold learning algorithms to separate vowels in a low dimensional
space is evaluated and compared to a classical linear dimensionality
reduction method. Results indicate that manifold learning algorithms
outperform classical methods in low dimensions and are capable of
discovering useful manifold structure in speech data.},
  keywords  = {ISOMAP, LLE, nonlinear dimensionality reduction, speech dimensionality},
  owner     = {aerrity},
  timestamp = {2006.06.21},
  url       = {http://www.isca-speech.org/archive/archive_papers/interspeech_2006/i06_1667.pdf}
}
% Interspeech 2004 (ICSLP) paper on estimating line spectral frequency trajectories with unscented Kalman filtering.
% The title protects the whole word Kalman instead of a single letter, and the month is
% given as the standard macro (oct) rather than a literal string.
@inproceedings{Errity2004,
  author    = {A. Errity and J. McKenna and S. Isard},
  title     = {Unscented {Kalman} Filtering of Line Spectral Frequencies},
  booktitle = {Proc. of the Int. Conf. on Spoken Language Processing (Interspeech
2004 - ICSLP)},
  year      = {2004},
  pages     = {2697--2700},
  address   = {Jeju, Korea},
  month     = oct,
  abstract  = {We propose a new method for estimating Line Spectral Frequency (LSF)
trajectories that uses unscented {K}alman filtering (UKF). This method
is based upon an iterative Expectation Maximisation (EM) approach
in which LSF estimates are generated during a forward pass and then
smoothed during a backward pass. The EM approach also provides re-estimated
{K}alman filter parameters for further forward-backward passes that
improve estimation. This approach exploits the non-independence of
neighbouring spectra. We estimate LSFs as they have good interpolation
and quantization properties. This allows us to estimate LSF trajectories
that are guaranteed to result in stable filters. We analyse noisy
synthetic speech using this technique. The results compare favourably
with other methods.},
  owner     = {aerrity},
  timestamp = {2008.02.09},
  url       = {http://www.isca-speech.org/archive/archive_papers/interspeech_2004/i04_2697.pdf}
}
% Springer LNCS 4885 (NOLISP 2007 revised selected papers): manifold-learning feature
% transformation evaluated on TIMIT phone classification.
@inproceedings{Errity2007Springer,
  author    = {Andrew Errity and John McKenna and Barry Kirkpatrick},
  title     = {Manifold Learning-Based Feature Transformation for Phone Classification},
  editor    = {Mohamed Chetouani and Amir Hussain and Bruno Gas and Maurice Milgram
and Jean-Luc Zarader},
  booktitle = {Advances in Nonlinear Speech Processing, International Conference
on Non-Linear Speech Processing, NOLISP 2007, Paris, France, May
22-25, 2007, Revised Selected Papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {4885},
  pages     = {132--141},
  publisher = {Springer},
  year      = {2007},
  abstract  = {This paper investigates approaches for low dimensional speech feature
transformation using manifold learning. It has recently been shown
that speech sounds may exist on a low dimensional manifold nonlinearly
embedded in high dimensional space. A number of techniques have been
developed in recent years that attempt to discover the geometric
structure of the underlying low dimensional manifold. The manifold
learning techniques locally linear embedding and {I}somap are considered
in this study. The low dimensional feature representations produced
by these techniques are applied to several phone classification tasks
on the TIMIT corpus. Classification accuracy is analysed and compared
to conventional MFCC features and PCA, a linear dimensionality reduction
method, transformed features. It is shown that features resulting
from manifold learning are capable of yielding higher classification
accuracy than these baseline features. The best phone classification
accuracy in general is demonstrated by feature transformation with
{I}somap.},
  owner     = {ame},
  timestamp = {2008.02.09},
  url       = {http://www.springerlink.com/content/d41l8t1475608612/fulltext.pdf}
}
% NOLISP 2007 workshop paper: manifold-learning feature transformation for TIMIT phone classification.
% Month is given as the standard macro (may) rather than a literal string.
@inproceedings{Errity2007,
  author    = {A. Errity and J. McKenna and B. Kirkpatrick},
  title     = {Manifold learning-based feature transformation for phone classification},
  booktitle = {Proc. of the ISCA Tutorial and Research Workshop on Nonlinear Speech
Processing (NOLISP)},
  year      = {2007},
  pages     = {43--46},
  address   = {Paris, France},
  month     = may,
  abstract  = {This paper investigates approaches for low dimensional speech feature
transformation using manifold learning. It has recently been shown
that speech sounds may exist on a low dimensional manifold nonlinearly
embedded in high dimensional space. A number of techniques have been
developed in recent years that attempt to discover the geometric
structure of the underlying low dimensional manifold. The manifold
learning techniques locally linear embedding and {I}somap are considered
in this study. The low dimensional feature representations produced
by these techniques are applied to several phone classification tasks
on the TIMIT corpus. Classification accuracy is analysed and compared
to conventional MFCC features and PCA, a linear dimensionality reduction
method, transformed features. It is shown that features resulting
from manifold learning are capable of yielding higher classification
accuracy than these baseline features. The best phone classification
accuracy in general is demonstrated by feature transformation with
{I}somap.},
  owner     = {aerrity},
  timestamp = {2007.04.24},
  url       = {http://www.isca-speech.org/archive/archive_papers/nolisp07/nol7_039.pdf}
}
% Interspeech 2007 paper: dimensionality reduction applied to MFCC and MODGDF features.
% Month is given as the standard macro (aug) rather than a literal string.
@inproceedings{Errity2007b,
  author    = {A. Errity and J. McKenna and B. Kirkpatrick},
  title     = {Dimensionality reduction methods applied to both magnitude and phase
derived features},
  booktitle = {Proc. of Interspeech 2007 - Eurospeech},
  year      = {2007},
  pages     = {1957--1960},
  address   = {Antwerp, Belgium},
  month     = aug,
  abstract  = {A number of previous studies have shown that speech sounds may have
an intrinsic low dimensional structure. Such studies have focused
on magnitude-based features ignoring phase information, as is the
convention in many speech processing applications. In this paper
dimensionality reduction methods are applied to MFCC and modified
group delay function (MODGDF) features derived from the magnitude
and phase spectrum, respectively. The low dimensional structure of
these representations is examined and a method to combine these features
is detailed. Results show that both magnitude and phase derived features
have a low dimensional structure. MFCCs are found to offer higher
accuracy than MODGDFs in phone classification tasks. Results indicate
that combining MFCCs and MODGDFs gives improvements for phone classification.
PCA is shown to be capable of efficiently combining MFCCs and MODGDFs
for improved classification accuracy without large increases in feature
dimensionality.},
  keywords  = {manifold learning, dimensionality reduction, phase, modified group
delay function},
  owner     = {ame},
  timestamp = {2007.05.31},
  url       = {http://www.isca-speech.org/archive/archive_papers/interspeech_2007/i07_1957.pdf}
}
% DSP 2007 paper: spectral dynamics as a source of discontinuity in concatenative speech synthesis.
% Month is given as the standard macro (jul) rather than a literal string.
@inproceedings{Kirkpatrick2007,
  author    = {B. Kirkpatrick and D. O'Brien and R. Scaife and A. Errity},
  title     = {Spectral dynamics as a source of discontinuity in concatenative speech
synthesis},
  booktitle = {Proc. of the 15th Int. Conf. on Digital Signal Processing (DSP)},
  year      = {2007},
  pages     = {615--618},
  address   = {Cardiff, Wales},
  month     = jul,
  abstract  = {The quality of concatenative speech synthesis depends on the cost
function employed for unit selection. Effective cost functions for
spectral continuity have proven difficult to define and standard
measures do not accurately reflect human perception of spectral discontinuity
in concatenated speech. Previous studies on spectral join costs have
focused predominantly on static spectral measures extracted from
the unit boundary. In this paper spectral dynamic behaviour is investigated
as a source of discontinuity in concatenated speech. A number of
measures representing spectral dynamics are tested for the task of
detecting discontinuities. The spectral dynamic measures tested contain
information correlating with human perception of discontinuities,
suggesting that spectral dynamics are a source of discontinuity in
concatenated speech. A strategy to effectively combine dynamic and
static measures is proposed using principal component analysis (PCA).},
  owner     = {aerrity},
  timestamp = {2007.04.24},
  url       = {http://ieeexplore.ieee.org/iel5/4288490/4288491/04288657.pdf}
}
% Interspeech 2007 paper on the role of spectral dynamics in unit selection speech synthesis.
% Month is given as the standard macro (aug) rather than a literal string.
@inproceedings{Kirkpatrick2007a,
  author    = {B. Kirkpatrick and D. O'Brien and R. Scaife and A. Errity},
  title     = {On the Role of Spectral Dynamics in Unit Selection Speech Synthesis},
  booktitle = {Proc. of Interspeech 2007 - Eurospeech},
  year      = {2007},
  pages     = {2889--2892},
  address   = {Antwerp, Belgium},
  month     = aug,
  owner     = {ame},
  timestamp = {2007.05.31},
  url       = {http://www.isca-speech.org/archive/archive_papers/interspeech_2007/i07_2889.pdf}
}
% Tool metadata (tagged jabref-meta, presumably written by JabRef): four selector entries for the publisher, author, journal and keywords fields, each with an empty value. Kept verbatim.
@COMMENT{{jabref-meta: selector_publisher:}}
@COMMENT{{jabref-meta: selector_author:}}
@COMMENT{{jabref-meta: selector_journal:}}
@COMMENT{{jabref-meta: selector_keywords:}}
This file has been generated by bibtex2html 1.74