@techreport{IMM2007-05378,
  author      = {M{\o}rup, M. and Schmidt, M. N.},
  title       = {Shift Invariant Sparse Coding of Image and Music Data},
  institution = {Informatics and Mathematical Modelling, Technical University of Denmark, {DTU}},
  address     = {Richard Petersens Plads, Building 321, {DK-}2800 Kgs. Lyngby},
  year        = {2007},
  url         = {http://www2.imm.dtu.dk/pubdb/p.php?5378},
  abstract    = {Sparse coding is a well established principle for unsupervised learning. Traditionally, features are extracted in sparse coding in specific locations, however, often we would prefer a shift invariant representation. This paper introduces the shift invariant sparse coding (SISC) model. The model decomposes an image into shift invariant feature images as well as a sparse coding matrix indicating where and to what degree in the original image these features are present. The model is not only useful, for analyzing shift invariant structures in image data, but also for analyzing the amplitude spectrogram of audio signals since a change in pitch relates to a shift in a logarithmic frequency axis. The {SISC} model is extended to handle data from several channels under the assumption that each feature is linearly mixed into the channels. For image analysis this implies that each feature has a fixed color coding for all locations. While for analysis of audio signals it means that features have fixed spatial position. The model is overcomplete and we therefore invoke sparse coding. The optimal degree of sparseness is estimated by an {'L-}curve'-like argument. We propose to use the sparsity parameter that maximizes the curvature in the graph of the residual sum of squares plotted against the number of non-zero elements in the sparse coding matrix. With this choice of regularization, the algorithm can correctly identify components of non-trivial artificial as well as real image and audio data. For image data, the algorithm identify relevant patterns and the sparse coding matrix indicates where and to what degree these patterns are present. When applied to music, the model can identify the harmonic structures of instruments, while the sparse coding matrix accounts for the notes played.},
}