{"id":"https://openalex.org/W4415971058","doi":"https://doi.org/10.1109/taffc.2025.3629970","title":"Describe Where You Are: Improving Noise-Robustness for Speech Emotion Recognition With Text Description of the Environment","display_name":"Describe Where You Are: Improving Noise-Robustness for Speech Emotion Recognition With Text Description of the Environment","publication_year":2025,"publication_date":"2025-11-06","ids":{"openalex":"https://openalex.org/W4415971058","doi":"https://doi.org/10.1109/taffc.2025.3629970","pmid":"https://pubmed.ncbi.nlm.nih.gov/42023408"},"language":"en","primary_location":{"id":"doi:10.1109/taffc.2025.3629970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2025.3629970","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC13099230/","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024273749","display_name":"Seong-Gyun Leem","orcid":"https://orcid.org/0000-0002-1175-1577"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]},{"id":"https://openalex.org/I4210128585","display_name":"META Health","ror":"https://ror.org/035h67p10","country_code":"US","type":"other","lineage":["https://openalex.org/I4210128585"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Seong-Gyun Leem","raw_affiliation_strings":["Meta Reality Labs, Redmond, WA, USA","Department of Electrical and Computer Engineering, The University of Texas at Dallas, Richardson, TX, USA"],"raw_orcid":"https://orcid.org/0000-0002-1175-1577","affiliations":[{"raw_affiliation_string":"Meta Reality Labs, Redmond, WA, USA","institution_ids":["https://openalex.org/I4210128585"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, The University of Texas at Dallas, Richardson, TX, USA","institution_ids":["https://openalex.org/I162577319"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026166163","display_name":"Daniel Fulford","orcid":"https://orcid.org/0000-0003-4405-9031"},"institutions":[{"id":"https://openalex.org/I111088046","display_name":"Boston University","ror":"https://ror.org/05qwgg493","country_code":"US","type":"education","lineage":["https://openalex.org/I111088046"]},{"id":"https://openalex.org/I897549280","display_name":"Hologic (Germany)","ror":"https://ror.org/007zn4n28","country_code":"DE","type":"company","lineage":["https://openalex.org/I4210117952","https://openalex.org/I897549280"]}],"countries":["DE","US"],"is_corresponding":false,"raw_author_name":"Daniel Fulford","raw_affiliation_strings":["Occupational Therapy and Psychological and Brain Sciences, Boston University, Boston, MA, USA","Occupational Therapy and Psychological and Brain Sciences, Boston University, MA, USA"],"raw_orcid":"https://orcid.org/0000-0003-4405-9031","affiliations":[{"raw_affiliation_string":"Occupational Therapy and Psychological and Brain Sciences, Boston University, Boston, MA, USA","institution_ids":["https://openalex.org/I111088046"]},{"raw_affiliation_string":"Occupational Therapy and Psychological and Brain Sciences, Boston University, MA, USA","institution_ids":["https://openalex.org/I897549280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041695925","display_name":"Jukka\u2010Pekka Onnela","orcid":"https://orcid.org/0000-0001-6613-8668"},"institutions":[{"id":"https://openalex.org/I136199984","display_name":"Harvard University","ror":"https://ror.org/03vek6s52","country_code":"US","type":"education","lineage":["https://openalex.org/I136199984"]},{"id":"https://openalex.org/I4210101190","display_name":"Cancer Research And Biostatistics","ror":"https://ror.org/01575p865","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210101190"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jukka-Pekka Onnela","raw_affiliation_strings":["Department of Biostatistics, Harvard T.H. Chan School of Public Health, Harvard University, Cambridge, MA, USA","Department of Biostatistics, Harvard T.H. Chan School of Public Health, Harvard University, MA, USA"],"raw_orcid":"https://orcid.org/0000-0001-6613-8668","affiliations":[{"raw_affiliation_string":"Department of Biostatistics, Harvard T.H. Chan School of Public Health, Harvard University, Cambridge, MA, USA","institution_ids":["https://openalex.org/I136199984"]},{"raw_affiliation_string":"Department of Biostatistics, Harvard T.H. Chan School of Public Health, Harvard University, MA, USA","institution_ids":["https://openalex.org/I136199984","https://openalex.org/I4210101190"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007724150","display_name":"David E. Gard","orcid":"https://orcid.org/0000-0002-0446-4000"},"institutions":[{"id":"https://openalex.org/I71838634","display_name":"San Francisco State University","ror":"https://ror.org/05ykr0121","country_code":"US","type":"education","lineage":["https://openalex.org/I71838634"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"David Gard","raw_affiliation_strings":["Psychology Department, San Francisco State University, San Francisco, CA, USA","Psychology Department, San Francisco State University, CA, USA"],"raw_orcid":"https://orcid.org/0000-0002-0446-4000","affiliations":[{"raw_affiliation_string":"Psychology Department, San Francisco State University, San Francisco, CA, USA","institution_ids":["https://openalex.org/I71838634"]},{"raw_affiliation_string":"Psychology Department, San Francisco State University, CA, USA","institution_ids":["https://openalex.org/I71838634"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040793194","display_name":"Carlos Busso","orcid":"https://orcid.org/0000-0002-4075-4072"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Carlos Busso","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0002-4075-4072","affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.4863,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.94819122,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"17","issue":"1","first_page":"656","last_page":"669"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9722999930381775,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9722999930381775,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10664","display_name":"Sentiment Analysis and Opinion Mining","score":0.006399999838322401,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.005100000184029341,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6948999762535095},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6872000098228455},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.6363000273704529},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.5493999719619751},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5126000046730042},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5055000185966492},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4975999891757965},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.4415999948978424}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7294999957084656},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6948999762535095},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6872000098228455},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.642300009727478},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.6363000273704529},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5166000127792358},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5126000046730042},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5055000185966492},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4975999891757965},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4934000074863434},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.4415999948978424},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.36980000138282776},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.34540000557899475},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.33869999647140503},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30399999022483826},{"id":"https://openalex.org/C2988148770","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion detection","level":3,"score":0.29179999232292175},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.287200003862381},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/taffc.2025.3629970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2025.3629970","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"},{"id":"pmid:42023408","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/42023408","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on affective computing","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:13099230","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC13099230/","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Trans Affect Comput","raw_type":"Text"}],"best_oa_location":{"id":"pmh:oai:pubmedcentral.nih.gov:13099230","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC13099230/","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Trans Affect Comput","raw_type":"Text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"<italic":[0,126,144,170],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[1,127,145,171],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Speech":[2],"emotion":[3,164],"recognition</i>":[4],"(SER)":[5],"systems":[6],"often":[7],"struggle":[8],"in":[9],"real-world":[10,104],"environments,":[11],"where":[12,49],"ambient":[13],"noise":[14,62,106],"severely":[15],"degrades":[16],"their":[17,60],"performance.":[18],"This":[19],"paper":[20],"explores":[21],"a":[22,45,66,81,125,143],"novel":[23],"approach":[24,95],"that":[25,118,134],"exploits":[26],"prior":[27],"knowledge":[28],"of":[29,93,138],"testing":[30],"environments":[31],"to":[32,70,80],"maximize":[33],"SER":[34,51,83,140],"performance":[35],"under":[36],"noisy":[37],"conditions.":[38],"To":[39],"address":[40],"this":[41],"task,":[42],"we":[43],"propose":[44],"text-guided,":[46],"environment-aware":[47],"training":[48,86],"an":[50],"model":[52,84],"is":[53],"trained":[54],"with":[55,99,162],"contaminated":[56],"speech":[57],"samples":[58,107],"and":[59,76,87,103,112,190],"paired":[61],"description.":[63],"We":[64,89],"use":[65],"pre-trained":[67],"text":[68,160,178],"encoder":[69,161,179],"extract":[71],"the":[72,91,100,110,119,136,139,159,163,168,177],"text-based":[73,120],"environment":[74,121],"embedding":[75],"then":[77],"fuse":[78],"it":[79],"transformer-based":[82],"during":[85],"inference.":[88],"demonstrate":[90],"effectiveness":[92],"our":[94,97,150,181],"through":[96],"experiment":[98,116],"MSP-Podcast":[101],"corpus":[102],"additive":[105],"collected":[108],"from":[109],"Freesound":[111],"DEMAND":[113],"repositories.":[114],"Our":[115],"indicates":[117],"descriptions":[122],"processed":[123],"by":[124,156,185],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">large":[128],"language":[129],"model</i>":[130],"(LLM)":[131],"produce":[132],"representations":[133],"improve":[135],"noise-robustness":[137],"system.":[141],"With":[142],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">contrastive":[146],"learning</i>":[147],"(CL)-based":[148],"representation,":[149],"proposed":[151],"method":[152,184],"can":[153],"be":[154],"improved":[155],"jointly":[157],"fine-tuning":[158,176],"recognition":[165],"model.":[166],"Under":[167],"-5dB":[169],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">signal-to-noise":[172],"ratio</i>":[173],"(SNR)":[174],"level,":[175],"improves":[180],"CL-based":[182],"representation":[183],"76.4%":[186],"(arousal),":[187],"100.0%":[188],"(dominance),":[189],"27.7%":[191],"(valence).":[192]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-06T00:00:00"}
