Publications

314 / 3,591 publications found.


  •  Petermann, D., Wichern, G., Wang, Z.-Q., Le Roux, J., "The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9746005, April 2022, pp. 526-530.
    BibTeX TR2022-022 PDF Software
    • @inproceedings{Petermann2022apr,
    • author = {Petermann, Darius and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {526--530},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9746005},
    • url = {https://www.merl.com/publications/TR2022-022}
    • }
  •  Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
    BibTeX TR2022-019 PDF
    • @inproceedings{Shah2022apr,
    • author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and Le Roux, Jonathan and Hori, Chiori},
    • title = {Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7732--7736},
    • month = apr,
    • publisher = {IEEE},
    • issn = {1520-6149},
    • isbn = {978-1-6654-0540-9},
    • url = {https://www.merl.com/publications/TR2022-019}
    • }
  •  Slizovskaia, O., Wichern, G., Wang, Z.-Q., Le Roux, J., "Locate This, Not That: Class-Conditioned Sound Event DOA Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9747604, April 2022, pp. 711-715.
    BibTeX TR2022-023 PDF
    • @inproceedings{Slizovskaia2022mar,
    • author = {Slizovskaia, Olga and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Locate This, Not That: Class-Conditioned Sound Event DOA Estimation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {711--715},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9747604},
    • url = {https://www.merl.com/publications/TR2022-023}
    • }
  •  Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
    BibTeX TR2022-016 PDF
    • @inproceedings{Hori2022feb,
    • author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Le Roux, Jonathan and Marks, Tim K.},
    • title = {Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-016}
    • }
  •  Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", February 2022.
    BibTeX TR2022-025 PDF
    • @misc{Shah2022feb,
    • author = {Shah, Ankit Parag and Hori, Takaaki and Le Roux, Jonathan and Hori, Chiori},
    • title = {DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-025}
    • }
  •  Cherian, A., Hori, C., Marks, T.K., Le Roux, J., "(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering", AAAI Conference on Artificial Intelligence, DOI: 10.1609/​aaai.v36i1.19922, February 2022, pp. 444-453.
    BibTeX TR2022-014 PDF Video Presentation
    • @inproceedings{Cherian2022feb,
    • author = {Cherian, Anoop and Hori, Chiori and Marks, Tim K. and Le Roux, Jonathan},
    • title = {(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering},
    • booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
    • year = 2022,
    • pages = {444--453},
    • month = feb,
    • doi = {10.1609/aaai.v36i1.19922},
    • url = {https://www.merl.com/publications/TR2022-014}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/​TASLP.2021.3129363, Vol. 29, pp. 3476-3490, December 2021.
    BibTeX TR2021-144 PDF
    • @article{Wang2021dec,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2021,
    • volume = 29,
    • pages = {3476--3490},
    • month = dec,
    • doi = {10.1109/TASLP.2021.3129363},
    • url = {https://www.merl.com/publications/TR2021-144}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "On The Compensation Between Magnitude and Phase in Speech Separation", IEEE Signal Processing Letters, DOI: 10.1109/​LSP.2021.3116502, Vol. 28, pp. 2018-2022, November 2021.
    BibTeX TR2021-137 PDF
    • @article{Wang2021nov2,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {On The Compensation Between Magnitude and Phase in Speech Separation},
    • journal = {IEEE Signal Processing Letters},
    • year = 2021,
    • volume = 28,
    • pages = {2018--2022},
    • month = nov,
    • doi = {10.1109/LSP.2021.3116502},
    • url = {https://www.merl.com/publications/TR2021-137}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Reverberant Speech Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632667, October 2021, pp. 56-60.
    BibTeX TR2021-127 PDF
    • @inproceedings{Wang2021oct4,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Convolutive Prediction for Reverberant Speech Separation},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {56--60},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632667},
    • url = {https://www.merl.com/publications/TR2021-127}
    • }
  •  Wichern, G., Chakrabarty, A., Wang, Z.-Q., Le Roux, J., "Anomalous sound detection using attentive neural processes", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632762, October 2021, pp. 186-190.
    BibTeX TR2021-129 PDF
    • @inproceedings{Wichern2021oct,
    • author = {Wichern, Gordon and Chakrabarty, Ankush and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Anomalous sound detection using attentive neural processes},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {186--190},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632762},
    • url = {https://www.merl.com/publications/TR2021-129}
    • }
  •  Chatterjee, M., Le Roux, J., Ahuja, N., Cherian, A., "Visual Scene Graphs for Audio Source Separation", IEEE International Conference on Computer Vision (ICCV), October 2021, pp. 1204-1213.
    BibTeX TR2021-095 PDF Video Software
    • @inproceedings{Chatterjee2021oct,
    • author = {Chatterjee, Moitreya and Le Roux, Jonathan and Ahuja, Narendra and Cherian, Anoop},
    • title = {Visual Scene Graphs for Audio Source Separation},
    • booktitle = {IEEE International Conference on Computer Vision (ICCV)},
    • year = 2021,
    • pages = {1204--1213},
    • month = oct,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2021-095}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-571, September 2021, pp. 726-730.
    BibTeX TR2021-103 PDF
    • @inproceedings{Higuchi2021sep,
    • author = {Higuchi, Yosuke and Moritz, Niko and Le Roux, Jonathan and Hori, Takaaki},
    • title = {Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {726--730},
    • month = sep,
    • doi = {10.21437/Interspeech.2021-571},
    • url = {https://www.merl.com/publications/TR2021-103}
    • }
  •  Hori, T., Moritz, N., Hori, C., Le Roux, J., "Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers", Interspeech, DOI: 10.21437/​Interspeech.2021-1643, August 2021, pp. 2097-2101.
    BibTeX TR2021-100 PDF
    • @inproceedings{Hori2021aug3,
    • author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and Le Roux, Jonathan},
    • title = {Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {2097--2101},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1643},
    • url = {https://www.merl.com/publications/TR2021-100}
    • }
  •  Hori, C., Hori, T., Le Roux, J., "Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/Interspeech.2021-1975, August 2021, pp. 586-590.
    BibTeX TR2021-093 PDF
    • @inproceedings{Hori2021aug2,
    • author = {Hori, Chiori and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {586--590},
    • month = aug,
    • publisher = {ISCA},
    • doi = {10.21437/Interspeech.2021-1975},
    • url = {https://www.merl.com/publications/TR2021-093}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-1693, August 2021, pp. 1822-1826.
    BibTeX TR2021-094 PDF
    • @inproceedings{Moritz2021aug,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {1822--1826},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1693},
    • url = {https://www.merl.com/publications/TR2021-094}
    • }
  •  Hori, C., "Human Perspective Scene Understanding via Multimodal Sensing," Tech. Rep. TR2022-151, Audio-Visual Scene Understanding Tutorial at CVPR 2021, June 2021.
    BibTeX TR2022-151 PDF Video
    • @techreport{Hori2021jun,
    • author = {Hori, Chiori},
    • title = {Human Perspective Scene Understanding via Multimodal Sensing},
    • institution = {Mitsubishi Electric Research Laboratories},
    • year = 2021,
    • month = jun,
    • url = {https://www.merl.com/publications/TR2022-151}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Capturing Multi-Resolution Context by Dilated Self-Attention", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9415001, June 2021, pp. 5869-5873.
    BibTeX TR2021-036 PDF
    • @inproceedings{Moritz2021jun,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Capturing Multi-Resolution Context by Dilated Self-Attention},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {5869--5873},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9415001},
    • url = {https://www.merl.com/publications/TR2021-036}
    • }
  •  Hung, Y.-N., Wichern, G., Le Roux, J., "Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9413358, June 2021, pp. 46-50.
    BibTeX TR2021-069 PDF
    • @inproceedings{Hung2021jun,
    • author = {Hung, Yun-Ning and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {46--50},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9413358},
    • issn = {2379-190X},
    • isbn = {978-1-7281-7605-5},
    • url = {https://www.merl.com/publications/TR2021-069}
    • }
  •  Khurana, S., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9414299, June 2021, pp. 6553-6557.
    BibTeX TR2021-039 PDF
    • @inproceedings{Khurana2021jun,
    • author = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {6553--6557},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9414299},
    • url = {https://www.merl.com/publications/TR2021-039}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Semi-Supervised Speech Recognition via Graph-Based Temporal Classification", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9414058, June 2021, pp. 6548-6552.
    BibTeX TR2021-037 PDF
    • @inproceedings{Moritz2021jun2,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Semi-Supervised Speech Recognition via Graph-Based Temporal Classification},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {6548--6552},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9414058},
    • url = {https://www.merl.com/publications/TR2021-037}
    • }
  •  Watanabe, S., Boyer, F., Chang, X., Guo, P., Hayashi, T., Higuchi, Y., Hori, T., Huang, W.-C., Inaguma, H., Kamo, N., Karita, S., Li, C., Shi, J., Subramanian, A.S., Zhang, W., "The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans", IEEE Data Science and Learning Workshop (DSLW), DOI: 10.1109/DSLW51110, June 2021, pp. 1-6.
    BibTeX TR2021-073 PDF
    • @inproceedings{Watanabe2021jun,
    • author = {Watanabe, Shinji and Boyer, Florian and Chang, Xuankai and Guo, Pengcheng and Hayashi, Tomoki and Higuchi, Yosuke and Hori, Takaaki and Huang, Wen-Chin and Inaguma, Hirofumi and Kamo, Naoyuki and Karita, Shigeki and Li, Chenda and Shi, Jing and Subramanian, Aswin S. and Zhang, Wangyou},
    • title = {The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans},
    • booktitle = {IEEE Data Science and Learning Workshop (DSLW)},
    • year = 2021,
    • pages = {1--6},
    • month = jun,
    • publisher = {IEEE},
    • doi = {10.1109/DSLW51110},
    • isbn = {978-1-6654-2826-2},
    • url = {https://www.merl.com/publications/TR2021-073}
    • }
  •  Kim, S., Galley, M., Gunasekara, C., Lee, S., Atkinson, A., Peng, B., Schulz, H., Gao, J., Li, J., Adada, M., Huang, M., Lastras, L., Kummerfeld, J.K., Lasecki, W.S., Hori, C., Cherian, A., Marks, T.K., Rastogi, A., Zang, X., Sunkara, S., Gupta, R., "Overview of the Eighth Dialog System Technology Challenge: DSTC8", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/​TASLP.2021.3078368, May 2021.
    BibTeX TR2021-064 PDF
    • @article{Kim2021may,
    • author = {Kim, Seokhwan and Galley, Michel and Gunasekara, Chulaka and Lee, Sungjin and Atkinson, Adam and Peng, Baolin and Schulz, Hannes and Gao, Jianfeng and Li, Jinchao and Adada, Mahmoud and Huang, Minlie and Lastras, Luis and Kummerfeld, Jonathan K. and Lasecki, Walter S. and Hori, Chiori and Cherian, Anoop and Marks, Tim K. and Rastogi, Abhinav and Zang, Xiaoxue and Sunkara, Srinivas and Gupta, Raghav},
    • title = {Overview of the Eighth Dialog System Technology Challenge: DSTC8},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2021,
    • month = may,
    • doi = {10.1109/TASLP.2021.3078368},
    • issn = {2329-9290},
    • url = {https://www.merl.com/publications/TR2021-064}
    • }
  •  Hori, C., Tsuchiya, M., Chen, S., Cherian, A., Hori, T., Harsham, B.A., Marks, T.K., Le Roux, J., Sullivan, A., Vetro, A., "マルチモーダルセンシング情報に基づくScene-aware Interaction 技術", Society of Automotive Engineers of Japan, Vol. 75, No. 5, pp. 66-71, May 2021.
    BibTeX TR2021-042 PDF Video
    • @article{Hori2021may,
    • author = {Hori, Chiori and Tsuchiya, Masato and Chen, Siheng and Cherian, Anoop and Hori, Takaaki and Harsham, Bret A. and Marks, Tim K. and Le Roux, Jonathan and Sullivan, Alan and Vetro, Anthony},
    • title = {マルチモーダルセンシング情報に基づくScene-aware Interaction 技術},
    • journal = {Society of Automotive Engineers of Japan},
    • year = 2021,
    • volume = 75,
    • number = 5,
    • pages = {66--71},
    • month = may,
    • url = {https://www.merl.com/publications/TR2021-042}
    • }
  •  Hori, T., Moritz, N., Hori, C., Le Roux, J., "Transformer-based Long-context End-to-end Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2020-2928, October 2020, pp. 5011-5015.
    BibTeX TR2020-139 PDF Presentation
    • @inproceedings{Hori2020oct,
    • author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and Le Roux, Jonathan},
    • title = {Transformer-based Long-context End-to-end Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2020,
    • pages = {5011--5015},
    • month = oct,
    • doi = {10.21437/Interspeech.2020-2928},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2020-139}
    • }
  •  Jayashankar, T., Le Roux, J., Moulin, P., "Detecting Audio Attacks on ASR Systems with Dropout Uncertainty", Interspeech, DOI: 10.21437/​Interspeech.2020-1846, October 2020, pp. 4671-4675.
    BibTeX TR2020-137 PDF Presentation
    • @inproceedings{Jayashankar2020oct,
    • author = {Jayashankar, Tejas and Le Roux, Jonathan and Moulin, Pierre},
    • title = {Detecting Audio Attacks on ASR Systems with Dropout Uncertainty},
    • booktitle = {Interspeech},
    • year = 2020,
    • pages = {4671--4675},
    • month = oct,
    • doi = {10.21437/Interspeech.2020-1846},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2020-137}
    • }