Publications

160 / 3,738 publications found.


  •  Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
    BibTeX TR2022-019 PDF
    • @inproceedings{Shah2022apr,
    • author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and Le Roux, Jonathan and Hori, Chiori},
    • title = {Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7732--7736},
    • month = apr,
    • publisher = {IEEE},
    • issn = {1520-6149},
    • isbn = {978-1-6654-0540-9},
    • url = {https://www.merl.com/publications/TR2022-019}
    • }
  •  Slizovskaia, O., Wichern, G., Wang, Z.-Q., Le Roux, J., "Locate This, Not That: Class-Conditioned Sound Event DOA Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9747604, April 2022, pp. 711-715.
    BibTeX TR2022-023 PDF
    • @inproceedings{Slizovskaia2022mar,
    • author = {Slizovskaia, Olga and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Locate This, Not That: Class-Conditioned Sound Event DOA Estimation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {711--715},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9747604},
    • url = {https://www.merl.com/publications/TR2022-023}
    • }
  •  Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
    BibTeX TR2022-016 PDF
    • @inproceedings{Hori2022feb,
    • author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Le Roux, Jonathan and Marks, Tim K.},
    • title = {Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-016}
    • }
  •  Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", The 10th Dialog System Technology Challenge Workshop at AAAI 2022, February 2022.
    BibTeX TR2022-025 PDF
    • @inproceedings{Shah2022feb,
    • author = {Shah, Ankit Parag and Hori, Takaaki and Le Roux, Jonathan and Hori, Chiori},
    • title = {DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI 2022},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-025}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識", Acoustical Society of Japan Spring Meeting (ASJ), February 2022.
    BibTeX
    • @inproceedings{Higuchi2022feb,
    • author = {Higuchi, Yosuke and Moritz, Niko and Le Roux, Jonathan and Hori, Takaaki},
    • title = {Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識},
    • booktitle = {Acoustical Society of Japan Spring Meeting (ASJ)},
    • year = 2022,
    • month = feb
    • }
  •  Cherian, A., Hori, C., Marks, T.K., Le Roux, J., "(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering", AAAI Conference on Artificial Intelligence, DOI: 10.1609/​aaai.v36i1.19922, February 2022, pp. 444-453.
    BibTeX TR2022-014 PDF Video Presentation
    • @inproceedings{Cherian2022feb,
    • author = {Cherian, Anoop and Hori, Chiori and Marks, Tim K. and Le Roux, Jonathan},
    • title = {(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering},
    • booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
    • year = 2022,
    • pages = {444--453},
    • month = feb,
    • doi = {10.1609/aaai.v36i1.19922},
    • url = {https://www.merl.com/publications/TR2022-014}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/​TASLP.2021.3129363, Vol. 29, pp. 3476-3490, December 2021.
    BibTeX TR2021-144 PDF
    • @article{Wang2021dec,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2021,
    • volume = 29,
    • pages = {3476--3490},
    • month = dec,
    • doi = {10.1109/TASLP.2021.3129363},
    • url = {https://www.merl.com/publications/TR2021-144}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "On The Compensation Between Magnitude and Phase in Speech Separation", IEEE Signal Processing Letters, DOI: 10.1109/​LSP.2021.3116502, Vol. 28, pp. 2018-2022, November 2021.
    BibTeX TR2021-137 PDF
    • @article{Wang2021nov2,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {On The Compensation Between Magnitude and Phase in Speech Separation},
    • journal = {IEEE Signal Processing Letters},
    • year = 2021,
    • volume = 28,
    • pages = {2018--2022},
    • month = nov,
    • doi = {10.1109/LSP.2021.3116502},
    • url = {https://www.merl.com/publications/TR2021-137}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Reverberant Speech Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632667, October 2021, pp. 56-60.
    BibTeX TR2021-127 PDF
    • @inproceedings{Wang2021oct4,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Convolutive Prediction for Reverberant Speech Separation},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {56--60},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632667},
    • url = {https://www.merl.com/publications/TR2021-127}
    • }
  •  Wichern, G., Chakrabarty, A., Wang, Z.-Q., Le Roux, J., "Anomalous sound detection using attentive neural processes", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632762, October 2021, pp. 186-190.
    BibTeX TR2021-129 PDF
    • @inproceedings{Wichern2021oct,
    • author = {Wichern, Gordon and Chakrabarty, Ankush and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Anomalous sound detection using attentive neural processes},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {186--190},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632762},
    • url = {https://www.merl.com/publications/TR2021-129}
    • }
  •  Chatterjee, M., Le Roux, J., Ahuja, N., Cherian, A., "Visual Scene Graphs for Audio Source Separation", IEEE International Conference on Computer Vision (ICCV), October 2021, pp. 1204-1213.
    BibTeX TR2021-095 PDF Video Software
    • @inproceedings{Chatterjee2021oct,
    • author = {Chatterjee, Moitreya and Le Roux, Jonathan and Ahuja, Narendra and Cherian, Anoop},
    • title = {Visual Scene Graphs for Audio Source Separation},
    • booktitle = {IEEE International Conference on Computer Vision (ICCV)},
    • year = 2021,
    • pages = {1204--1213},
    • month = oct,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2021-095}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement", arXiv, October 2021.
    BibTeX arXiv
    • @article{Wang2021oct,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement},
    • journal = {arXiv},
    • year = 2021,
    • month = oct,
    • eprint = {2110.00570},
    • archiveprefix = {arXiv},
    • url = {https://arxiv.org/abs/2110.00570}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-571, September 2021, pp. 726-730.
    BibTeX TR2021-103 PDF
    • @inproceedings{Higuchi2021sep,
    • author = {Higuchi, Yosuke and Moritz, Niko and Le Roux, Jonathan and Hori, Takaaki},
    • title = {Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {726--730},
    • month = sep,
    • doi = {10.21437/Interspeech.2021-571},
    • url = {https://www.merl.com/publications/TR2021-103}
    • }
  •  Hori, T., Moritz, N., Hori, C., Le Roux, J., "Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers", Interspeech, DOI: 10.21437/​Interspeech.2021-1643, August 2021, pp. 2097-2101.
    BibTeX TR2021-100 PDF
    • @inproceedings{Hori2021aug3,
    • author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and Le Roux, Jonathan},
    • title = {Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {2097--2101},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1643},
    • url = {https://www.merl.com/publications/TR2021-100}
    • }
  •  Hori, C., Hori, T., Le Roux, J., "Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/​Interspeech.2021-1975, August 2021, pp. 586-590.
    BibTeX TR2021-093 PDF
    • @inproceedings{Hori2021aug2,
    • author = {Hori, Chiori and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {586--590},
    • month = aug,
    • publisher = {ISCA},
    • doi = {10.21437/Interspeech.2021-1975},
    • url = {https://www.merl.com/publications/TR2021-093}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-1693, August 2021, pp. 1822-1826.
    BibTeX TR2021-094 PDF
    • @inproceedings{Moritz2021aug,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {1822--1826},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1693},
    • url = {https://www.merl.com/publications/TR2021-094}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Capturing Multi-Resolution Context by Dilated Self-Attention", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9415001, June 2021, pp. 5869-5873.
    BibTeX TR2021-036 PDF
    • @inproceedings{Moritz2021jun,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Capturing Multi-Resolution Context by Dilated Self-Attention},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {5869--5873},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9415001},
    • url = {https://www.merl.com/publications/TR2021-036}
    • }
  •  Hung, Y.-N., Wichern, G., Le Roux, J., "Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9413358, June 2021, pp. 46-50.
    BibTeX TR2021-069 PDF
    • @inproceedings{Hung2021jun,
    • author = {Hung, Yun-Ning and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {46--50},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9413358},
    • issn = {2379-190X},
    • isbn = {978-1-7281-7605-5},
    • url = {https://www.merl.com/publications/TR2021-069}
    • }
  •  Khurana, S., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9414299, June 2021, pp. 6553-6557.
    BibTeX TR2021-039 PDF
    • @inproceedings{Khurana2021jun,
    • author = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {6553--6557},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9414299},
    • url = {https://www.merl.com/publications/TR2021-039}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Semi-Supervised Speech Recognition via Graph-Based Temporal Classification", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP39728.2021.9414058, June 2021, pp. 6548-6552.
    BibTeX TR2021-037 PDF
    • @inproceedings{Moritz2021jun2,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Semi-Supervised Speech Recognition via Graph-Based Temporal Classification},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {6548--6552},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9414058},
    • url = {https://www.merl.com/publications/TR2021-037}
    • }
  •  Hori, C., Tsuchiya, M., Chen, S., Cherian, A., Hori, T., Harsham, B.A., Marks, T.K., Le Roux, J., Sullivan, A., Vetro, A., "マルチモーダルセンシング情報に基づくScene-aware Interaction 技術", Society of Automotive Engineers of Japan, Vol. 75, No. 5, pp. 66-71, May 2021.
    BibTeX TR2021-042 PDF Video
    • @article{Hori2021may,
    • author = {Hori, Chiori and Tsuchiya, Masato and Chen, Siheng and Cherian, Anoop and Hori, Takaaki and Harsham, Bret A. and Marks, Tim K. and Le Roux, Jonathan and Sullivan, Alan and Vetro, Anthony},
    • title = {マルチモーダルセンシング情報に基づくScene-aware Interaction 技術},
    • journal = {Society of Automotive Engineers of Japan},
    • year = 2021,
    • volume = 75,
    • number = 5,
    • pages = {66--71},
    • month = may,
    • url = {https://www.merl.com/publications/TR2021-042}
    • }
  •  Geng, S., Gao, P., Chatterjee, M., Hori, C., Le Roux, J., Zhang, Y., Li, H., Cherian, A., "Dynamic Graph Representation Learning for Video Dialog via Multi-Modal Shuffled Transformers", AAAI Conference on Artificial Intelligence, February 2021, pp. 1415-1423.
    BibTeX TR2021-010 PDF
    • @inproceedings{Geng2021feb,
    • author = {Geng, Shijie and Gao, Peng and Chatterjee, Moitreya and Hori, Chiori and Le Roux, Jonathan and Zhang, Yongfeng and Li, Hongsheng and Cherian, Anoop},
    • title = {Dynamic Graph Representation Learning for Video Dialog via Multi-Modal Shuffled Transformers},
    • booktitle = {AAAI Conference on Artificial Intelligence},
    • year = 2021,
    • pages = {1415--1423},
    • month = feb,
    • publisher = {AAAI Press},
    • address = {Palo Alto, California, USA},
    • isbn = {978-1-57735-866-4},
    • url = {https://www.merl.com/publications/TR2021-010}
    • }
  •  Hori, T., Moritz, N., Hori, C., Le Roux, J., "Transformer-based Long-context End-to-end Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2020-2928, October 2020, pp. 5011-5015.
    BibTeX TR2020-139 PDF Presentation
    • @inproceedings{Hori2020oct,
    • author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and Le Roux, Jonathan},
    • title = {Transformer-based Long-context End-to-end Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2020,
    • pages = {5011--5015},
    • month = oct,
    • doi = {10.21437/Interspeech.2020-2928},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2020-139}
    • }
  •  Jayashankar, T., Le Roux, J., Moulin, P., "Detecting Audio Attacks on ASR Systems with Dropout Uncertainty", Interspeech, DOI: 10.21437/​Interspeech.2020-1846, October 2020, pp. 4671-4675.
    BibTeX TR2020-137 PDF Presentation
    • @inproceedings{Jayashankar2020oct,
    • author = {Jayashankar, Tejas and Le Roux, Jonathan and Moulin, Pierre},
    • title = {Detecting Audio Attacks on ASR Systems with Dropout Uncertainty},
    • booktitle = {Interspeech},
    • year = 2020,
    • pages = {4671--4675},
    • month = oct,
    • doi = {10.21437/Interspeech.2020-1846},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2020-137}
    • }
  •  Moritz, N., Wichern, G., Hori, T., Le Roux, J., "All-in-One Transformer: Unifying Speech Recognition, Audio Tagging, and Event Detection", Interspeech, DOI: 10.21437/​Interspeech.2020-2757, October 2020, pp. 3112-3116.
    BibTeX TR2020-138 PDF Presentation
    • @inproceedings{Moritz2020oct,
    • author = {Moritz, Niko and Wichern, Gordon and Hori, Takaaki and Le Roux, Jonathan},
    • title = {All-in-One Transformer: Unifying Speech Recognition, Audio Tagging, and Event Detection},
    • booktitle = {Interspeech},
    • year = 2020,
    • pages = {3112--3116},
    • month = oct,
    • doi = {10.21437/Interspeech.2020-2757},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2020-138}
    • }