% Conference paper by L. Feng and L. K. Hansen, CogSci 2007.
% Citation key kept as exported so existing \cite{IMM2007-04871} calls keep working.
% Empty export placeholders (volume, series, editor, publisher, organization,
% address) were dropped; re-add them only with real values.
@inproceedings{IMM2007-04871,
  author    = {Feng, L. and Hansen, L. K.},
  title     = {Cognitive Components of Speech at Different Time Scales},
  booktitle = {{CogSci} 2007},
  year      = {2007},
  month     = aug,
  keywords  = {Cognitive component analysis; time scales; energy based sparsification; statistical regularity; unsupervised learning; supervised learning},
  url       = {http://www2.compute.dtu.dk/pubdb/pubs/4871-full.html},
  abstract  = {Cognitive component analysis (COCA) is defined as unsupervised grouping of data leading to a group structure well-aligned with that resulting from human cognitive activity. We focus here on speech at different time scales looking for possible hidden `cognitive structure'. Statistical regularities have earlier been revealed at multiple time scales corresponding to: phoneme, gender, height and speaker identity. We here show that the same simple unsupervised learning algorithm can detect these cues. Our basic features are 25-dimensional short-time Mel-frequency weighted cepstral coefficients, assumed to model the basic representation of the human auditory system. The basic features are aggregated in time to obtain features at longer time scales. Simple energy based filtering is used to achieve a sparse representation. Our hypothesis is now basically ecological: We hypothesize that features that are essentially independent in a reasonable ensemble can be efficiently coded using a sparse independent component representation. The representations are indeed shown to be very similar between supervised learning (invoking cognitive activity) and unsupervised learning (statistical regularities), hence lending additional support to our cognitive component hypothesis.},
}