Publications

Fujimura, T., Wichern, G., Masuyama, Y., Boeddeker, C., Saijo, K., Richter, J., Edo, T., Le Roux, J., "The MERL Systems for DCASE 2026 Challenge Task 2," Tech. Rep. TR2026-100, IEEE AASP Challenge on Detection and Classification of Acoustic Scenes and Events (DCASE Challenge), June 2026.
BibTeX TR2026-100 PDF
- @techreport{Fujimura2026jun,
-   author      = {Fujimura, Takuya and Wichern, Gordon and Masuyama, Yoshiki and Boeddeker, Christoph and Saijo, Kohei and Richter, Julius and Edo, Takahiro and {Le Roux}, Jonathan},
-   title       = {{The MERL Systems for DCASE 2026 Challenge Task 2}},
-   institution = {IEEE AASP Challenge on Detection and Classification of Acoustic Scenes and Events (DCASE Challenge)},
-   number      = {TR2026-100},
-   year        = {2026},
-   month       = jun,
-   url         = {https://www.merl.com/publications/TR2026-100},
- }
Saijo, K., Masuyama, Y., Boeddeker, C., Wichern, G., Richter, J., Edo, T., Le Roux, J., "The MERL Systems for DCASE 2026 Challenge Task 4," Tech. Rep. TR2026-098, IEEE AASP Challenge on Detection and Classification of Acoustic Scenes and Events (DCASE Challenge), June 2026.
BibTeX TR2026-098 PDF
- @techreport{Saijo2026jun,
-   author      = {Saijo, Kohei and Masuyama, Yoshiki and Boeddeker, Christoph and Wichern, Gordon and Richter, Julius and Edo, Takahiro and {Le Roux}, Jonathan},
-   title       = {{The MERL Systems for DCASE 2026 Challenge Task 4}},
-   institution = {IEEE AASP Challenge on Detection and Classification of Acoustic Scenes and Events (DCASE Challenge)},
-   number      = {TR2026-098},
-   year        = {2026},
-   month       = jun,
-   url         = {https://www.merl.com/publications/TR2026-098},
- }
Aihara, R., Masuyama, Y., Paissan, F., Germain, F.G., Wichern, G., Le Roux, J., "SUNAC: Source-aware Unified Neural Audio Codec", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP55912.2026.11461849, May 2026, pp. 14427-14431.
BibTeX TR2026-032 PDF
- @inproceedings{Aihara2026may,
-   author    = {Aihara, Ryo and Masuyama, Yoshiki and Paissan, Francesco and Germain, François G and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{SUNAC: Source-aware Unified Neural Audio Codec}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   year      = {2026},
-   pages     = {14427--14431},
-   month     = may,
-   doi       = {10.1109/ICASSP55912.2026.11461849},
-   url       = {https://www.merl.com/publications/TR2026-032},
- }
Han, J., Wang, R., Masuyama, Y., Delcroix, M., Rohdin, J., Du, J., Burget, L., "Spatially Aware Self-Supervised Models for Multi-Channel Neural Speaker Diarization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP55912.2026.11463023, May 2026, pp. 17447-17451.
BibTeX TR2026-047 PDF
- @inproceedings{Han2026may,
-   author    = {Han, Jiangyu and Wang, Ruoyu and Masuyama, Yoshiki and Delcroix, Marc and Rohdin, Johan and Du, Jun and Burget, Lukáš},
-   title     = {{Spatially Aware Self-Supervised Models for Multi-Channel Neural Speaker Diarization}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   year      = {2026},
-   pages     = {17447--17451},
-   month     = may,
-   doi       = {10.1109/ICASSP55912.2026.11463023},
-   url       = {https://www.merl.com/publications/TR2026-047},
- }
Masuyama, Y., Germain, F.G., Wichern, G., Hori, C., Le Roux, J., "Velocity Potential Neural Field for Efficient Ambisonics Impulse Response Modeling", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP55912.2026.11460631, May 2026, pp. 22582-22586.
BibTeX TR2026-033 PDF
- @inproceedings{Masuyama2026may,
-   author    = {Masuyama, Yoshiki and Germain, François G and Wichern, Gordon and Hori, Chiori and {Le Roux}, Jonathan},
-   title     = {{Velocity Potential Neural Field for Efficient Ambisonics Impulse Response Modeling}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   year      = {2026},
-   pages     = {22582--22586},
-   month     = may,
-   doi       = {10.1109/ICASSP55912.2026.11460631},
-   url       = {https://www.merl.com/publications/TR2026-033},
- }
Masuyama, Y., Saijo, K., Paissan, F., Han, J., Delcroix, M., Aihara, R., Germain, F.G., Wichern, G., Le Roux, J., "FlexIO: Flexible Single- and Multi-Channel Speech Separation and Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP55912.2026.11462393, May 2026, pp. 14417-14421.
BibTeX TR2026-034 PDF
- @inproceedings{Masuyama2026may2,
-   author    = {Masuyama, Yoshiki and Saijo, Kohei and Paissan, Francesco and Han, Jiangyu and Delcroix, Marc and Aihara, Ryo and Germain, François G and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{FlexIO: Flexible Single- and Multi-Channel Speech Separation and Enhancement}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   year      = {2026},
-   pages     = {14417--14421},
-   month     = may,
-   doi       = {10.1109/ICASSP55912.2026.11462393},
-   url       = {https://www.merl.com/publications/TR2026-034},
- }
Aihara, R., Masuyama, Y., Germain, F.G., Wichern, G., Le Roux, J., "Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations", IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW), DOI: 10.1109/ICASSP55912.2026.11462776, May 2026, pp. 21992-21996.
BibTeX TR2026-035 PDF
- @inproceedings{Aihara2026may2,
-   author    = {Aihara, Ryo and Masuyama, Yoshiki and Germain, François G and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)},
-   year      = {2026},
-   pages     = {21992--21996},
-   month     = may,
-   doi       = {10.1109/ICASSP55912.2026.11462776},
-   url       = {https://www.merl.com/publications/TR2026-035},
- }
Jeon, C.-B., Wichern, G., Germain, F.G., Le Roux, J., "Embracing Cacophony: Explaining and Improving Random Mixing in Music Source Separation", IEEE Open Journal of Signal Processing, DOI: 10.1109/OJSP.2025.3633567, Vol. 6, pp. 1179-1192, January 2026.
BibTeX TR2026-012 PDF Software
- @article{Jeon2026jan,
-   author  = {Jeon, Chang-Bin and Wichern, Gordon and Germain, François G and {Le Roux}, Jonathan},
-   title   = {{Embracing Cacophony: Explaining and Improving Random Mixing in Music Source Separation}},
-   journal = {IEEE Open Journal of Signal Processing},
-   year    = {2026},
-   volume  = {6},
-   pages   = {1179--1192},
-   month   = jan,
-   doi     = {10.1109/OJSP.2025.3633567},
-   url     = {https://www.merl.com/publications/TR2026-012},
- }
Wilkinghoff, K., Yang, H., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Local Density-Based Anomaly Score Normalization for Domain Generalization", IEEE Transactions on Audio, Speech and Language Processing, DOI: 10.1109/TASLPRO.2025.3629236, Vol. 33, pp. 4642-4652, January 2026.
BibTeX TR2026-010 PDF Software
- @article{Wilkinghoff2026jan,
-   author  = {Wilkinghoff, Kevin and Yang, Haici and Ebbers, Janek and Germain, François G and Wichern, Gordon and {Le Roux}, Jonathan},
-   title   = {{Local Density-Based Anomaly Score Normalization for Domain Generalization}},
-   journal = {IEEE Transactions on Audio, Speech and Language Processing},
-   year    = {2026},
-   volume  = {33},
-   pages   = {4642--4652},
-   month   = jan,
-   doi     = {10.1109/TASLPRO.2025.3629236},
-   issn    = {2998-4173},
-   url     = {https://www.merl.com/publications/TR2026-010},
- }
Cornell, S., Boeddeker, C., Park, T., Huang, H., Raj, D., Wiesner, M., Masuyama, Y., Chang, X., Wang, Z.-Q., Squartini, S., Garcia, P., Watanabe, S., "Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101901, Vol. 97, pp. 101901, December 2025.
BibTeX TR2026-008 PDF
- @article{Cornell2025dec,
-   author  = {Cornell, Samuele and Boeddeker, Christoph and Park, Taejin and Huang, He and Raj, Desh and Wiesner, Matthew and Masuyama, Yoshiki and Chang, Xuankai and Wang, Zhong-Qiu and Squartini, Stefano and Garcia, Paola and Watanabe, Shinji},
-   title   = {{Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges}},
-   journal = {Computer Speech \& Language},
-   year    = {2025},
-   volume  = {97},
-   pages   = {101901},
-   month   = dec,
-   doi     = {10.1016/j.csl.2025.101901},
-   url     = {https://www.merl.com/publications/TR2026-008},
- }
Masuyama, Y., Wichern, G., Germain, F.G., Ick, C., Le Roux, J., "SuDaField: Subject- and Dataset-Aware Neural Field for HRTF Modeling", IEEE Open Journal of Signal Processing, DOI: 10.1109/OJSP.2025.3627073, Vol. 6, pp. 1169-1178, December 2025.
BibTeX TR2026-009 PDF Software
- @article{Masuyama2025dec2,
-   author  = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G and Ick, Christopher and {Le Roux}, Jonathan},
-   title   = {{SuDaField: Subject- and Dataset-Aware Neural Field for HRTF Modeling}},
-   journal = {IEEE Open Journal of Signal Processing},
-   year    = {2025},
-   volume  = {6},
-   pages   = {1169--1178},
-   month   = dec,
-   doi     = {10.1109/OJSP.2025.3627073},
-   url     = {https://www.merl.com/publications/TR2026-009},
- }
Masuyama, Y., Wichern, G., Germain, F.G., Ick, C., Le Roux, J., "RANF: Neural Field-Based HRTF Spatial Upsampling with Retrieval Augmentation and Parameter Efficient Fine-Tuning", IEEE Open Journal of Signal Processing, DOI: 10.1109/OJSP.2025.3640517, Vol. 7, pp. 32-41, December 2025.
BibTeX TR2026-007 PDF Software
- @article{Masuyama2025dec,
-   author  = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G and Ick, Christopher and {Le Roux}, Jonathan},
-   title   = {{RANF: Neural Field-Based HRTF Spatial Upsampling with Retrieval Augmentation and Parameter Efficient Fine-Tuning}},
-   journal = {IEEE Open Journal of Signal Processing},
-   year    = {2025},
-   volume  = {7},
-   pages   = {32--41},
-   month   = dec,
-   doi     = {10.1109/OJSP.2025.3640517},
-   url     = {https://www.merl.com/publications/TR2026-007},
- }
Hori, C., Masuyama, Y., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Robot Confirmation Generation and Action Planning Using Long-context Q-Former Integrated with Multimodal LLM", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU65441.2025.11434641, December 2025.
BibTeX TR2025-167 PDF
- @inproceedings{Hori2025dec,
-   author    = {Hori, Chiori and Masuyama, Yoshiki and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
-   title     = {{Robot Confirmation Generation and Action Planning Using Long-context Q-Former Integrated with Multimodal LLM}},
-   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
-   year      = {2025},
-   month     = dec,
-   doi       = {10.1109/ASRU65441.2025.11434641},
-   issn      = {2997-6995},
-   isbn      = {979-8-3315-4426-3},
-   url       = {https://www.merl.com/publications/TR2025-167},
- }
Masuyama, Y., "Neural Fields for Spatial Audio Modeling," Tech. Rep. TR2025-171, Speech and Audio in the Northeast (SANE), November 2025.
BibTeX TR2025-171 PDF
- @techreport{Masuyama2025nov,
-   author      = {Masuyama, Yoshiki},
-   title       = {{Neural Fields for Spatial Audio Modeling}},
-   institution = {Speech and Audio in the Northeast (SANE)},
-   number      = {TR2025-171},
-   year        = {2025},
-   month       = nov,
-   url         = {https://www.merl.com/publications/TR2025-171},
- }
Wilkinghoff, K., Fujimura, T., Imoto, K., Le Roux, J., Tan, Z.-H., Toda, T., "Handling Domain Shifts for Anomalous Sound Detection: A Review of DCASE-Related Work", Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE), DOI: 10.5281/zenodo.17251589, October 2025, pp. 20-24.
BibTeX TR2025-157 PDF
- @inproceedings{Wilkinghoff2025oct,
-   author    = {Wilkinghoff, Kevin and Fujimura, Takuya and Imoto, Keisuke and {Le Roux}, Jonathan and Tan, Zheng-Hua and Toda, Tomoki},
-   title     = {{Handling Domain Shifts for Anomalous Sound Detection: A Review of DCASE-Related Work}},
-   booktitle = {Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)},
-   year      = {2025},
-   pages     = {20--24},
-   month     = oct,
-   doi       = {10.5281/zenodo.17251589},
-   isbn      = {978-84-09-77652-8},
-   url       = {https://www.merl.com/publications/TR2025-157},
- }
Masuyama, Y., Germain, F.G., Wichern, G., Ick, C., Le Roux, J., "Physics-Informed Direction-Aware Neural Acoustic Fields", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA66052.2025.11230918, October 2025.
BibTeX TR2025-142 PDF
- @inproceedings{Masuyama2025oct,
-   author    = {Masuyama, Yoshiki and Germain, François G and Wichern, Gordon and Ick, Christopher and {Le Roux}, Jonathan},
-   title     = {{Physics-Informed Direction-Aware Neural Acoustic Fields}},
-   booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
-   year      = {2025},
-   month     = oct,
-   doi       = {10.1109/WASPAA66052.2025.11230918},
-   url       = {https://www.merl.com/publications/TR2025-142},
- }
Paissan, F., Wichern, G., Masuyama, Y., Aihara, R., Germain, F.G., Saijo, K., Le Roux, J., "FasTUSS: Faster Task-Aware Unified Source Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA66052.2025.11230943, October 2025.
BibTeX TR2025-143 PDF
- @inproceedings{Paissan2025oct,
-   author    = {Paissan, Francesco and Wichern, Gordon and Masuyama, Yoshiki and Aihara, Ryo and Germain, François G and Saijo, Kohei and {Le Roux}, Jonathan},
-   title     = {{FasTUSS: Faster Task-Aware Unified Source Separation}},
-   booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
-   year      = {2025},
-   month     = oct,
-   doi       = {10.1109/WASPAA66052.2025.11230943},
-   url       = {https://www.merl.com/publications/TR2025-143},
- }
Hussein, A., Khurana, S., Wichern, G., Germain, F.G., Le Roux, J., "HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement", Interspeech, DOI: 10.21437/Interspeech.2025-2063, August 2025, pp. 5393-5397.
BibTeX TR2025-122 PDF
- @inproceedings{Hussein2025aug,
-   author    = {Hussein, Amir and Khurana, Sameer and Wichern, Gordon and Germain, François G and {Le Roux}, Jonathan},
-   title     = {{HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement}},
-   booktitle = {Interspeech},
-   year      = {2025},
-   pages     = {5393--5397},
-   month     = aug,
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2063},
-   url       = {https://www.merl.com/publications/TR2025-122},
- }
Ick, C., Wichern, G., Masuyama, Y., Germain, F.G., Le Roux, J., "Direction-Aware Neural Acoustic Fields for Few-Shot Interpolation of Ambisonic Impulse Responses", Interspeech, DOI: 10.21437/Interspeech.2025-1912, August 2025, pp. 933-937.
BibTeX TR2025-120 PDF
- @inproceedings{Ick2025aug,
-   author    = {Ick, Christopher and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G and {Le Roux}, Jonathan},
-   title     = {{Direction-Aware Neural Acoustic Fields for Few-Shot Interpolation of Ambisonic Impulse Responses}},
-   booktitle = {Interspeech},
-   year      = {2025},
-   pages     = {933--937},
-   month     = aug,
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-1912},
-   url       = {https://www.merl.com/publications/TR2025-120},
- }
Khurana, S., Klement, D., Laurent, A., Bobos, D., Novosad, J., Gazdik, P., Zhang, E., Huang, Z., Hussein, A., Marxer, R., Masuyama, Y., Aihara, R., Hori, C., Germain, F.G., Wichern, G., Le Roux, J., "Factorized RVQ-GAN For Disentangled Speech Tokenization", Interspeech, DOI: 10.21437/Interspeech.2025-2612, August 2025, pp. 3514-3518.
BibTeX TR2025-123 PDF
- @inproceedings{Khurana2025aug,
-   author    = {Khurana, Sameer and Klement, Dominik and Laurent, Antoine and Bobos, Dominik and Novosad, Juraj and Gazdik, Peter and Zhang, Ellen and Huang, Zilli and Hussein, Amir and Marxer, Ricard and Masuyama, Yoshiki and Aihara, Ryo and Hori, Chiori and Germain, François G and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{Factorized RVQ-GAN For Disentangled Speech Tokenization}},
-   booktitle = {Interspeech},
-   year      = {2025},
-   pages     = {3514--3518},
-   month     = aug,
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2612},
-   url       = {https://www.merl.com/publications/TR2025-123},
- }
Yang, H., Wichern, G., Aihara, R., Masuyama, Y., Khurana, S., Germain, F.G., Le Roux, J., "Investigating Continuous Autoregressive Generative Speech Enhancement", Interspeech, DOI: 10.21437/Interspeech.2025-2335, August 2025, pp. 2360-2364.
BibTeX TR2025-119 PDF
- @inproceedings{Yang2025aug,
-   author    = {Yang, Haici and Wichern, Gordon and Aihara, Ryo and Masuyama, Yoshiki and Khurana, Sameer and Germain, François G and {Le Roux}, Jonathan},
-   title     = {{Investigating Continuous Autoregressive Generative Speech Enhancement}},
-   booktitle = {Interspeech},
-   year      = {2025},
-   pages     = {2360--2364},
-   month     = aug,
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2335},
-   url       = {https://www.merl.com/publications/TR2025-119},
- }
Steinmetz, C., Uhle, C., Everardo, F., Mitcheltree, C., McElveen, J.K., Jot, J.-M., Wichern, G., "Audio Signal Processing in the Artificial Intelligence Era: Challenges and Directions", Journal of the Audio Engineering Society, DOI: 10.17743/jaes.2022.0209, Vol. 73, No. 7/8, pp. 406-428, August 2025.
BibTeX TR2025-116 PDF
- @article{Steinmetz2025aug,
-   author  = {Steinmetz, Christian and Uhle, Christian and Everardo, Flavio and Mitcheltree, Christopher and McElveen, J. Keith and Jot, Jean-Marc and Wichern, Gordon},
-   title   = {{Audio Signal Processing in the Artificial Intelligence Era: Challenges and Directions}},
-   journal = {Journal of the Audio Engineering Society},
-   year    = {2025},
-   volume  = {73},
-   number  = {7/8},
-   pages   = {406--428},
-   month   = aug,
-   doi     = {10.17743/jaes.2022.0209},
-   url     = {https://www.merl.com/publications/TR2025-116},
- }
Almudévar, A., Hernández-Lobato, J.M., Khurana, S., Marxer, R., Ortega, A., "Aligning Multimodal Representations through an Information Bottleneck", International Conference on Machine Learning (ICML), July 2025.
BibTeX TR2025-109 PDF
- @inproceedings{Almudévar2025jul,
-   author    = {Almudévar, Antonio and Hernández-Lobato, José M. and Khurana, Sameer and Marxer, Ricard and Ortega, Alfonso},
-   title     = {{Aligning Multimodal Representations through an Information Bottleneck}},
-   booktitle = {International Conference on Machine Learning (ICML)},
-   year      = {2025},
-   month     = jul,
-   url       = {https://www.merl.com/publications/TR2025-109},
- }
Masuyama, Y., "Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition," Tech. Rep. TR2025-097, Jelinek Summer Workshop on Speech and Language Technology (JSALT), June 2025.
BibTeX TR2025-097 PDF
- @techreport{Masuyama2025jun,
-   author      = {Masuyama, Yoshiki},
-   title       = {{Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition}},
-   institution = {Jelinek Summer Workshop on Speech and Language Technology (JSALT)},
-   number      = {TR2025-097},
-   year        = {2025},
-   month       = jun,
-   url         = {https://www.merl.com/publications/TR2025-097},
- }
Lu, K., Ma, C., Hori, C., Romeres, D., "KitchenVLA: Iterative Vision-Language Corrections for Robotic Execution of Human Tasks", IEEE International Conference on Robotics and Automation Workshop on Safely Leveraging Vision-Language Foundation Models in Robotics (SafeLVMs@ICRA), May 2025.
BibTeX TR2025-068 PDF
- @inproceedings{Lu2025may,
-   author    = {Lu, Kai and Ma, Chenyang and Hori, Chiori and Romeres, Diego},
-   title     = {{KitchenVLA: Iterative Vision-Language Corrections for Robotic Execution of Human Tasks}},
-   booktitle = {IEEE International Conference on Robotics and Automation Workshop on Safely Leveraging Vision-Language Foundation Models in Robotics (SafeLVMs@ICRA)},
-   year      = {2025},
-   month     = may,
-   url       = {https://www.merl.com/publications/TR2025-068},
- }