Publications

Liu, X., Paul, S., Chatterjee, M., Cherian, A., "CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments", AAAI Conference on Artificial Intelligence, DOI: 10.1609/aaai.v38i4.28167, December 2023, pp. 3765-3773.
BibTeX TR2023-154 PDF
- @inproceedings{Liu2023dec2,
- author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
- title = {{CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments}},
- booktitle = {Proceedings of the 38th AAAI Conference on Artificial Intelligence},
- year = 2023,
- pages = {3765--3773},
- month = dec,
- doi = {10.1609/aaai.v38i4.28167},
- url = {https://www.merl.com/publications/TR2023-154}
- }
Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU57964.2023.10389618, December 2023.
BibTeX TR2023-152 PDF Video
- @inproceedings{Pan2023dec2,
- author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G. and Khurana, Sameer and Hori, Chiori and {Le Roux}, Jonathan},
- title = {{Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2023,
- month = dec,
- doi = {10.1109/ASRU57964.2023.10389618},
- isbn = {979-8-3503-0689-7},
- url = {https://www.merl.com/publications/TR2023-152}
- }
He, Y., Shin, S., Cherian, A., Markham, A., Trigoni, N., "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2023, pp. 5496-5507.
BibTeX TR2023-144 PDF
- @inproceedings{He2023dec,
- author = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Markham, Andrew and Trigoni, Niki},
- title = {{Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2023,
- pages = {5496--5507},
- month = dec,
- url = {https://www.merl.com/publications/TR2023-144}
- }
Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks", International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero), November 2023.
BibTeX TR2023-141 PDF
- @inproceedings{Wu2023nov,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G. and {Le Roux}, Jonathan and Watanabe, Shinji},
- title = {{On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks}},
- booktitle = {International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero)},
- year = 2023,
- month = nov,
- url = {https://www.merl.com/publications/TR2023-141}
- }
Falcon Perez, R., Wichern, G., Germain, F., Le Roux, J., "Location as supervision for weakly supervised multi-channel source separation of machine sounds", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248128, September 2023.
BibTeX TR2023-119 PDF Presentation
- @inproceedings{FalconPerez2023aug,
- author = {Falcon Perez, Ricardo and Wichern, Gordon and Germain, Francois and {Le Roux}, Jonathan},
- title = {{Location as supervision for weakly supervised multi-channel source separation of machine sounds}},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2023,
- month = sep,
- publisher = {IEEE},
- doi = {10.1109/WASPAA58266.2023.10248128},
- issn = {1947-1629},
- isbn = {979-8-3503-2372-6},
- url = {https://www.merl.com/publications/TR2023-119}
- }
Germain, F., Wichern, G., Le Roux, J., "Hyperbolic Unsupervised Anomalous Sound Detection", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248092, September 2023.
BibTeX TR2023-108 PDF Video Presentation
- @inproceedings{Germain2023aug,
- author = {Germain, Francois and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Hyperbolic Unsupervised Anomalous Sound Detection}},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2023,
- month = sep,
- publisher = {IEEE},
- doi = {10.1109/WASPAA58266.2023.10248092},
- issn = {1947-1629},
- isbn = {979-8-3503-2372-6},
- url = {https://www.merl.com/publications/TR2023-108}
- }
Petermann, D., Wichern, G., Subramanian, A.S., Wang, Z.-Q., Le Roux, J., "Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2023.3290428, Vol. 31, pp. 2592-2605, September 2023.
BibTeX TR2023-113 PDF
- @article{Petermann2023sep,
- author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
- title = {{Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2023,
- volume = 31,
- pages = {2592--2605},
- month = sep,
- doi = {10.1109/TASLP.2023.3290428},
- issn = {2329-9304},
- url = {https://www.merl.com/publications/TR2023-113}
- }
Hori, C., Peng, P., Harwath, D., Liu, X., Ota, K., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos", Interspeech, DOI: 10.21437/Interspeech.2023-1983, August 2023, pp. 4663-4667.
BibTeX TR2023-104 PDF
- @inproceedings{Hori2023aug,
- author = {Hori, Chiori and Peng, Puyuan and Harwath, David and Liu, Xinyu and Ota, Kei and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
- title = {{Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos}},
- booktitle = {Interspeech},
- year = 2023,
- pages = {4663--4667},
- month = aug,
- doi = {10.21437/Interspeech.2023-1983},
- url = {https://www.merl.com/publications/TR2023-104}
- }
Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F., Le Roux, J., Watanabe, S., "BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up," Tech. Rep. TR2023-068, DCASE2023 Challenge, May 2023.
BibTeX TR2023-068 PDF
- @techreport{Wu2023may,
- author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, Francois and {Le Roux}, Jonathan and Watanabe, Shinji},
- title = {{BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up}},
- institution = {DCASE2023 Challenge},
- year = 2023,
- month = may,
- url = {https://www.merl.com/publications/TR2023-068}
- }
Zhang, J., Cherian, A., Liu, Y., Shabat, I.B., Rodriguez, C., Gould, S., "Aligning Step-by-Step Instructional Diagrams to Video Demonstrations", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), May 2023, pp. 2483-2492.
BibTeX TR2023-034 PDF
- @inproceedings{Zhang2023may,
- author = {Zhang, Jiahao and Cherian, Anoop and Liu, Yanbin and Shabat, Itzik Ben and Rodriguez, Cristian and Gould, Stephen},
- title = {{Aligning Step-by-Step Instructional Diagrams to Video Demonstrations}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2023,
- pages = {2483--2492},
- month = may,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2023-034}
- }
Chen, K., Wichern, G., Germain, F., Le Roux, J., "Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT", IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB), DOI: 10.1109/ICASSPW59220.2023.10193575, May 2023.
BibTeX TR2023-030 PDF
- @inproceedings{Chen2023may,
- author = {Chen, Ke and Wichern, Gordon and Germain, Francois and {Le Roux}, Jonathan},
- title = {{Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT}},
- booktitle = {IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB)},
- year = 2023,
- month = may,
- doi = {10.1109/ICASSPW59220.2023.10193575},
- isbn = {979-8-3503-0261-5},
- url = {https://www.merl.com/publications/TR2023-030}
- }
Aralikatti, R., Boeddeker, C., Wichern, G., Subramanian, A.S., Le Roux, J., "Reverberation as Supervision for Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10095022, May 2023, pp. 1-5.
BibTeX TR2023-016 PDF
- @inproceedings{Aralikatti2023may,
- author = {Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
- title = {{Reverberation as Supervision for Speech Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10095022},
- url = {https://www.merl.com/publications/TR2023-016}
- }
Bralios, D., Tzinis, E., Wichern, G., Smaragdis, P., Le Roux, J., "Latent Iterative Refinement for Modular Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096897, May 2023, pp. 1-5.
BibTeX TR2023-019 PDF
- @inproceedings{Bralios2023may,
- author = {Bralios, Dimitrios and Tzinis, Efthymios and Wichern, Gordon and Smaragdis, Paris and {Le Roux}, Jonathan},
- title = {{Latent Iterative Refinement for Modular Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10096897},
- url = {https://www.merl.com/publications/TR2023-019}
- }
Petermann, D., Wichern, G., Subramanian, A.S., Le Roux, J., "Hyperbolic Audio Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10094943, May 2023, pp. 1-5.
BibTeX TR2023-017 PDF Video Software
- @inproceedings{Petermann2023may,
- author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
- title = {{Hyperbolic Audio Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10094943},
- url = {https://www.merl.com/publications/TR2023-017}
- }
Tzinis, E., Wichern, G., Smaragdis, P., Le Roux, J., "Optimal Condition Training for Target Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10095128, May 2023, pp. 1-5.
BibTeX TR2023-018 PDF
- @inproceedings{Tzinis2023may,
- author = {Tzinis, Efthymios and Wichern, Gordon and Smaragdis, Paris and {Le Roux}, Jonathan},
- title = {{Optimal Condition Training for Target Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10095128},
- url = {https://www.merl.com/publications/TR2023-018}
- }
Yen, H., Germain, F., Wichern, G., Le Roux, J., "Cold Diffusion for Speech Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096064, May 2023, pp. 1-5.
BibTeX TR2023-020 PDF
- @inproceedings{Yen2023may,
- author = {Yen, Hao and Germain, Francois and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Cold Diffusion for Speech Enhancement}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10096064},
- url = {https://www.merl.com/publications/TR2023-020}
- }
Yoshino, K., Chen, Y.-N., Crook, P., Kottur, S., Li, J., Hedayatnia, B., Moon, S., Fei, Z., Li, Z., Zhang, J., Feng, Y., Zhou, J., Kim, S., Liu, Y., Jin, D., Papangelis, A., Gopalakrishnan, K., Hakkani-Tur, D., Damavandi, B., Geramifard, A., Hori, C., Shah, A., Zhang, C., Li, H., Sedoc, J., D'Haro, L.F., Banchs, R., Rudnicky, A., "Overview of the Tenth Dialog System Technology Challenge: DSTC10", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TCSVT.2002.808437, Vol. 13, No. 2, pp. 121-130, February 2023.
BibTeX TR2023-109 PDF
- @article{Yoshino2023feb,
- author = {Yoshino, Koichiro and Chen, Yun-Nung and Crook, Paul and Kottur, Satwik and Li, Jinchao and Hedayatnia, Behnam and Moon, Seungwhan and Fei, Zhengcong and Li, Zekang and Zhang, Jinchao and Feng, Yang and Zhou, Jie and Kim, Seokhwan and Liu, Yang and Jin, Di and Papangelis, Alexandros and Gopalakrishnan, Karthik and Hakkani-Tur, Dilek and Damavandi, Babak and Geramifard, Alborz and Hori, Chiori and Shah, Ankit and Zhang, Chen and Li, Haizhou and Sedoc, João and D'Haro, Luis F. and Banchs, Rafael and Rudnicky, Alexander},
- title = {{Overview of the Tenth Dialog System Technology Challenge: DSTC10}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2023,
- volume = 13,
- number = 2,
- pages = {121--130},
- month = feb,
- doi = {10.1109/TCSVT.2002.808437},
- url = {https://www.merl.com/publications/TR2023-109}
- }
Wang, Z.-Q., Wichern, G., Watanabe, S., Le Roux, J., "STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2022.3224285, Vol. 31, pp. 397-410, December 2022.
BibTeX TR2022-166 PDF
- @article{Wang2022dec2,
- author = {Wang, Zhong-Qiu and Wichern, Gordon and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2022,
- volume = 31,
- pages = {397--410},
- month = dec,
- doi = {10.1109/TASLP.2022.3224285},
- issn = {2329-9304},
- url = {https://www.merl.com/publications/TR2022-166}
- }
Venkatesh, S., Wichern, G., Subramanian, A.S., Le Roux, J., "Improved Domain Generalization via Disentangled Multi-Task Learning in Unsupervised Anomalous Sound Detection", DCASE Workshop, Lagrange, M., Mesaros, A., Pellegrini, T., Richard, G., Serizel, R., Stowell, D., Eds., November 2022.
BibTeX TR2022-146 PDF Presentation
- @inproceedings{Venkatesh2022nov,
- author = {Venkatesh, Satvik and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
- title = {{Improved Domain Generalization via Disentangled Multi-Task Learning in Unsupervised Anomalous Sound Detection}},
- booktitle = {DCASE Workshop},
- year = 2022,
- editor = {Lagrange, M. and Mesaros, A. and Pellegrini, T. and Richard, G. and Serizel, R. and Stowell, D.},
- month = nov,
- isbn = {978-952-03-2677-7},
- url = {https://www.merl.com/publications/TR2022-146}
- }
Chatterjee, M., Ahuja, N., Cherian, A., "Learning Audio-Visual Dynamics Using Scene Graphs for Audio Source Separation", Advances in Neural Information Processing Systems (NeurIPS), November 2022.
BibTeX TR2022-140 PDF Presentation
- @inproceedings{Chatterjee2022nov,
- author = {Chatterjee, Moitreya and Ahuja, Narendra and Cherian, Anoop},
- title = {{Learning Audio-Visual Dynamics Using Scene Graphs for Audio Source Separation}},
- booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
- year = 2022,
- month = nov,
- url = {https://www.merl.com/publications/TR2022-140}
- }
Paul, S., Roy Chowdhury, A.K., Cherian, A., "AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments", Advances in Neural Information Processing Systems (NeurIPS), October 2022, pp. 6236-6249.
BibTeX TR2022-131 PDF Video Data Software
- @inproceedings{Paul2022oct2,
- author = {Paul, Sudipta and Roy Chowdhury, Amit K. and Cherian, Anoop},
- title = {{AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments}},
- booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
- year = 2022,
- pages = {6236--6249},
- month = oct,
- url = {https://www.merl.com/publications/TR2022-131}
- }
Hori, C., Hori, T., Le Roux, J., "Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/Interspeech.2022-10891, September 2022, pp. 4511-4515.
BibTeX TR2022-116 PDF
- @inproceedings{Hori2022sep,
- author = {Hori, Chiori and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers}},
- booktitle = {Interspeech},
- year = 2022,
- pages = {4511--4515},
- month = sep,
- doi = {10.21437/Interspeech.2022-10891},
- url = {https://www.merl.com/publications/TR2022-116}
- }
Tzinis, E., Wichern, G., Subramanian, A.S., Smaragdis, P., Le Roux, J., "Heterogeneous Target Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2022-10717, September 2022, pp. 1796-1800.
BibTeX TR2022-115 PDF Video Presentation
- @inproceedings{Tzinis2022sep,
- author = {Tzinis, Efthymios and Wichern, Gordon and Subramanian, Aswin Shanmugam and Smaragdis, Paris and {Le Roux}, Jonathan},
- title = {{Heterogeneous Target Speech Separation}},
- booktitle = {Interspeech},
- year = 2022,
- pages = {1796--1800},
- month = sep,
- doi = {10.21437/Interspeech.2022-10717},
- url = {https://www.merl.com/publications/TR2022-115}
- }
Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2022.3195367, Vol. 16, No. 6, pp. 1424-1438, September 2022.
BibTeX TR2022-112 PDF
- @article{Higuchi2022sep,
- author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
- title = {{Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- year = 2022,
- volume = 16,
- number = 6,
- pages = {1424--1438},
- month = sep,
- doi = {10.1109/JSTSP.2022.3195367},
- issn = {1941-0484},
- url = {https://www.merl.com/publications/TR2022-112}
- }
Venkatesh, S., Wichern, G., Subramanian, A.S., Le Roux, J., "Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection," Tech. Rep. TR2022-092, Detection and Classification of Acoustic Scenes and Events (DCASE) Challenge 2022, July 2022.
BibTeX TR2022-092 PDF Presentation
- @techreport{Venkatesh2022jul,
- author = {Venkatesh, Satvik and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
- title = {{Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection}},
- institution = {DCASE2022 Challenge},
- year = 2022,
- month = jul,
- url = {https://www.merl.com/publications/TR2022-092}
- }