Publications

Pan, Z., Wichern, G., Germain, F.G., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", Hands-free Speech Communication and Microphone Arrays (HSCMA), April 2024.
BibTeX TR2024-029 PDF
- @inproceedings{Pan2024apr,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Subramanian, Aswin and Le Roux, Jonathan},
- title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
- booktitle = {Hands-free Speech Communication and Microphone Arrays (HSCMA)},
- year = 2024,
- month = apr,
- url = {https://www.merl.com/publications/TR2024-029}
- }
Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "SMITIN: Self-Monitored Inference-Time INtervention for Generative Music Transformers", arXiv, April 2024.
BibTeX arXiv
- @article{Koo2024apr2,
- author = {Koo, Junghyun and Wichern, Gordon and Germain, François G. and Khurana, Sameer and Le Roux, Jonathan},
- title = {{SMITIN}: Self-Monitored Inference-Time {INtervention} for Generative Music Transformers},
- journal = {arXiv},
- year = 2024,
- month = apr,
- eprint = {2404.02252},
- archiveprefix = {arXiv},
- url = {https://arxiv.org/abs/2404.02252}
- }
Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), April 2024.
BibTeX TR2024-032 PDF
- @inproceedings{Koo2024apr,
- author = {Koo, Junghyun and Wichern, Gordon and Germain, François G. and Khurana, Sameer and Le Roux, Jonathan},
- title = {Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads},
- booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
- year = 2024,
- month = apr,
- url = {https://www.merl.com/publications/TR2024-032}
- }
Jeon, C.-B., Wichern, G., Germain, F.G., Le Roux, J., "Why does music source separation benefit from cacophony?", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), March 2024.
BibTeX TR2024-030 PDF Video
- @inproceedings{Jeon2024mar,
- author = {Jeon, Chang-Bin and Wichern, Gordon and Germain, François G. and Le Roux, Jonathan},
- title = {Why does music source separation benefit from cacophony?},
- booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-030}
- }
Bralios, D., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "Generation or Replication: Auscultating Audio Latent Diffusion Models", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-027 PDF
- @inproceedings{Bralios2024mar,
- author = {Bralios, Dimitrios and Wichern, Gordon and Germain, François G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-027}
- }
Masuyama, Y., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-026 PDF
- @inproceedings{Masuyama2024mar,
- author = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {{NIIRF}: Neural {IIR} Filter Field for {HRTF} Upsampling and Personalization},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-026}
- }
Pan, Z., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-025 PDF
- @inproceedings{Pan2024mar,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Khurana, Sameer and Le Roux, Jonathan},
- title = {{NeuroHeed+}: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-025}
- }
Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-028 PDF
- @inproceedings{Wu2024mar,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G. and Le Roux, Jonathan and Watanabe, Shinji},
- title = {Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and {LLM} Mix-up Augmentation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-028}
- }
Baoueb, T., Liu, H., Fontaine, M., Le Roux, J., Richard, G., "SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-013 PDF
- @inproceedings{Baoueb2024mar,
- author = {Baoueb, Teysir and Liu, Haocheng and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
- title = {{SpecDiff-GAN}: A Spectrally-Shaped Noise Diffusion {GAN} for Speech and Music Synthesis},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-013}
- }
Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447600, March 2024, pp. 13296-13300.
BibTeX TR2024-012 PDF
- @inproceedings{Hori2024mar,
- author = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and Le Roux, Jonathan},
- title = {{Wi-Fi} based Indoor Monitoring Enhanced by Multimodal Fusion},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13296--13300},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10447600},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-012}
- }
Liu, H., Baoueb, T., Fontaine, M., Le Roux, J., Richard, G., "GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
BibTeX TR2024-014 PDF
- @inproceedings{Liu2024mar,
- author = {Liu, Haocheng and Baoueb, Teysir and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
- title = {{GLA-Grad}: A {Griffin-Lim} Extended Waveform Generation Diffusion Model},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- month = mar,
- url = {https://www.merl.com/publications/TR2024-014}
- }
Boeddeker, C., Subramanian, A.S., Wichern, G., Haeb-Umbach, R., Le Roux, J., "TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2024.3350887, Vol. 32, pp. 1185-1197, February 2024.
BibTeX TR2024-006 PDF
- @article{Boeddeker2024feb,
- author = {Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan},
- title = {{TS-SEP}: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2024,
- volume = 32,
- pages = {1185--1197},
- month = feb,
- doi = {10.1109/TASLP.2024.3350887},
- issn = {2329-9304},
- url = {https://www.merl.com/publications/TR2024-006}
- }
Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU57964.2023.10389618, December 2023.
BibTeX TR2023-152 PDF
- @inproceedings{Pan2023dec2,
- author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G. and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
- title = {Scenario-Aware Audio-Visual {TF-GridNet} for Target Speech Extraction},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2023,
- month = dec,
- doi = {10.1109/ASRU57964.2023.10389618},
- isbn = {979-8-3503-0689-7},
- url = {https://www.merl.com/publications/TR2023-152}
- }
Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks", International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero), November 2023.
BibTeX TR2023-141 PDF
- @inproceedings{Wu2023nov,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G. and Le Roux, Jonathan and Watanabe, Shinji},
- title = {On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks},
- booktitle = {International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero)},
- year = 2023,
- month = nov,
- url = {https://www.merl.com/publications/TR2023-141}
- }
Pan, Z., Wichern, G., Germain, F., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", arXiv, DOI: 10.48550/arXiv.2211.01299, September 2023.
BibTeX arXiv
- @article{Pan2023sep,
- author = {Pan, Zexu and Wichern, Gordon and Germain, Francois and Subramanian, Aswin and Le Roux, Jonathan},
- title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
- journal = {arXiv},
- year = 2023,
- month = sep,
- eprint = {2211.01299},
- archiveprefix = {arXiv},
- doi = {10.48550/arXiv.2211.01299},
- url = {https://arxiv.org/abs/2211.01299}
- }
Falcon Perez, R., Wichern, G., Germain, F., Le Roux, J., "Location as supervision for weakly supervised multi-channel source separation of machine sounds", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248128, September 2023.
BibTeX TR2023-119 PDF Presentation
- @inproceedings{FalconPerez2023aug,
- author    = {Falcon Perez, Ricardo and Wichern, Gordon and Germain, Francois and Le Roux, Jonathan},
- title     = {Location as supervision for weakly supervised multi-channel source separation of machine sounds},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- publisher = {IEEE},
- month     = sep,
- year      = {2023},
- isbn      = {979-8-3503-2372-6},
- issn      = {1947-1629},
- doi       = {10.1109/WASPAA58266.2023.10248128},
- url       = {https://www.merl.com/publications/TR2023-119}
- }
Germain, F., Wichern, G., Le Roux, J., "Hyperbolic Unsupervised Anomalous Sound Detection", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248092, September 2023.
BibTeX TR2023-108 PDF Video Presentation
- @inproceedings{Germain2023aug,
- author    = {Germain, Francois and Wichern, Gordon and Le Roux, Jonathan},
- title     = {Hyperbolic Unsupervised Anomalous Sound Detection},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- publisher = {IEEE},
- month     = sep,
- year      = {2023},
- isbn      = {979-8-3503-2372-6},
- issn      = {1947-1629},
- doi       = {10.1109/WASPAA58266.2023.10248092},
- url       = {https://www.merl.com/publications/TR2023-108}
- }
Petermann, D., Wichern, G., Subramanian, A.S., Wang, Z.-Q., Le Roux, J., "Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2023.3290428, Vol. 31, pp. 2592-2605, September 2023.
BibTeX TR2023-113 PDF
- @article{Petermann2023sep,
- author  = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Wang, Zhong-Qiu and Le Roux, Jonathan},
- title   = {Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- volume  = {31},
- pages   = {2592--2605},
- month   = sep,
- year    = {2023},
- issn    = {2329-9304},
- doi     = {10.1109/TASLP.2023.3290428},
- url     = {https://www.merl.com/publications/TR2023-113}
- }
Hori, C., Peng, P., Harwath, D., Liu, X., Ota, K., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos", Interspeech, DOI: 10.21437/Interspeech.2023-1983, August 2023, pp. 4663-4667.
BibTeX TR2023-104 PDF
- @inproceedings{Hori2023aug,
- author = {Hori, Chiori and Peng, Puyuan and Harwath, David and Liu, Xinyu and Ota, Kei and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and Le Roux, Jonathan},
- title = {Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos},
- booktitle = {Interspeech},
- year = 2023,
- pages = {4663--4667},
- month = aug,
- doi = {10.21437/Interspeech.2023-1983},
- url = {https://www.merl.com/publications/TR2023-104}
- }
Uhlich, S., Fabbro, G., Hirano, M., Takahashi, S., Wichern, G., Le Roux, J., Chakraborty, D., Mohanty, S., Li, K., Luo, Y., Yu, J., Gu, R., Solovyev, R., Stempkovskiy, A., Habruseva, T., Sukhovei, M., Mitsufuji, Y., "The Sound Demixing Challenge 2023 - Cinematic Demixing Track", arXiv, August 2023.
BibTeX arXiv
- @article{Uhlich2023aug,
- author = {Uhlich, Stefan and Fabbro, Giorgio and Hirano, Masato and Takahashi, Shusuke and Wichern, Gordon and Le Roux, Jonathan and Chakraborty, Dipam and Mohanty, Sharada and Li, Kai and Luo, Yi and Yu, Jianwei and Gu, Rongzhi and Solovyev, Roman and Stempkovskiy, Alexander and Habruseva, Tatiana and Sukhovei, Mikhail and Mitsufuji, Yuki},
- title = {The {Sound Demixing Challenge} 2023 -- {Cinematic Demixing Track}},
- journal = {arXiv},
- year = 2023,
- month = aug,
- eprint = {2308.06981},
- archiveprefix = {arXiv},
- url = {https://arxiv.org/abs/2308.06981}
- }
Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F., Le Roux, J., Watanabe, S., "BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up", Tech. Rep. TR2023-068, DCASE2023 Challenge, May 2023.
BibTeX TR2023-068 PDF
- @techreport{Wu2023may,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, Francois and Le Roux, Jonathan and Watanabe, Shinji},
- title = {{BEATs}-based Audio Captioning Model with Instructor Embedding Supervision and {ChatGPT} Mix-up},
- institution = {DCASE2023 Challenge},
- number = {TR2023-068},
- year = 2023,
- month = may,
- url = {https://www.merl.com/publications/TR2023-068}
- }
Chen, K., Wichern, G., Germain, F., Le Roux, J., "Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT", IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB), DOI: 10.1109/ICASSPW59220.2023.10193575, May 2023.
BibTeX TR2023-030 PDF
- @inproceedings{Chen2023may,
- author = {Chen, Ke and Wichern, Gordon and Germain, Francois and Le Roux, Jonathan},
- title = {{Pac-HuBERT}: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit {BERT}},
- booktitle = {IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB)},
- year = 2023,
- month = may,
- doi = {10.1109/ICASSPW59220.2023.10193575},
- isbn = {979-8-3503-0261-5},
- url = {https://www.merl.com/publications/TR2023-030}
- }
Aralikatti, R., Boeddeker, C., Wichern, G., Subramanian, A.S., Le Roux, J., "Reverberation as Supervision for Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10095022, May 2023, pp. 1-5.
BibTeX TR2023-016 PDF
- @inproceedings{Aralikatti2023may,
- author    = {Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin Shanmugam and Le Roux, Jonathan},
- title     = {Reverberation as Supervision for Speech Separation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {1--5},
- month     = may,
- year      = {2023},
- doi       = {10.1109/ICASSP49357.2023.10095022},
- url       = {https://www.merl.com/publications/TR2023-016}
- }
Bralios, D., Tzinis, E., Wichern, G., Smaragdis, P., Le Roux, J., "Latent Iterative Refinement for Modular Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096897, May 2023, pp. 1-5.
BibTeX TR2023-019 PDF
- @inproceedings{Bralios2023may,
- author    = {Bralios, Dimitrios and Tzinis, Efthymios and Wichern, Gordon and Smaragdis, Paris and Le Roux, Jonathan},
- title     = {Latent Iterative Refinement for Modular Source Separation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {1--5},
- month     = may,
- year      = {2023},
- doi       = {10.1109/ICASSP49357.2023.10096897},
- url       = {https://www.merl.com/publications/TR2023-019}
- }
Petermann, D., Wichern, G., Subramanian, A.S., Le Roux, J., "Hyperbolic Audio Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10094943, May 2023, pp. 1-5.
BibTeX TR2023-017 PDF Software
- @inproceedings{Petermann2023may,
- author    = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Le Roux, Jonathan},
- title     = {Hyperbolic Audio Source Separation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {1--5},
- month     = may,
- year      = {2023},
- doi       = {10.1109/ICASSP49357.2023.10094943},
- url       = {https://www.merl.com/publications/TR2023-017}
- }