Publications

245 / 2,975 publications found.


  •  Pishdadian, F., Wichern, G., Le Roux, J., "Finding Strength in Weakness: Learning to Separate Sounds with Weak Supervision", IEEE/ACM Transactions on Audio, Speech, and Language Processing, September 2020.
    BibTeX TR2020-126 PDF
    • @article{Pishdadian2020sep,
    • author = {Pishdadian, Fatemeh and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Finding Strength in Weakness: Learning to Separate Sounds with Weak Supervision},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2020,
    • month = sep,
    • url = {https://www.merl.com/publications/TR2020-126}
    • }
  •  Seetharaman, P., Wichern, G., Le Roux, J., Pardo, B., "Bootstrapping Unsupervised Deep Music Separation from Primitive Auditory Grouping Principles", ICML 2020 Workshop on Self-supervision in Audio and Speech, July 2020.
    BibTeX TR2020-111 PDF
    • @inproceedings{Seetharaman2020jul,
    • author = {Seetharaman, Prem and Wichern, Gordon and Le Roux, Jonathan and Pardo, Bryan},
    • title = {Bootstrapping Unsupervised Deep Music Separation from Primitive Auditory Grouping Principles},
    • booktitle = {ICML 2020 Workshop on Self-supervision in Audio and Speech},
    • year = 2020,
    • month = jul,
    • url = {https://www.merl.com/publications/TR2020-111}
    • }
  •  Chang, X., Zhang, W., Qian, Y., Le Roux, J., Watanabe, S., "End-To-End Multi-Speaker Speech Recognition with Transformer", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054029, April 2020, pp. 6134-6138.
    BibTeX TR2020-043 PDF Video
    • @inproceedings{Chang2020apr,
    • author = {Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {End-To-End Multi-Speaker Speech Recognition with Transformer},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {6134--6138},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9054029},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-043}
    • }
  •  Pishdadian, F., Wichern, G., Le Roux, J., "Learning to Separate Sounds From Weakly Labeled Scenes", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9053055, April 2020, pp. 91-95.
    BibTeX TR2020-038 PDF Video
    • @inproceedings{Pishdadian2020apr,
    • author = {Pishdadian, Fatemeh and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Learning to Separate Sounds From Weakly Labeled Scenes},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {91--95},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9053055},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-038}
    • }
  •  Maciejewski, M., Wichern, G., McQuinn, E., Le Roux, J., "WHAMR!: Noisy and Reverberant Single-Channel Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9053327, April 2020, pp. 696-700.
    BibTeX TR2020-042 PDF Video
    • @inproceedings{Maciejewski2020apr,
    • author = {Maciejewski, Matthew and Wichern, Gordon and McQuinn, Emmett and Le Roux, Jonathan},
    • title = {WHAMR!: Noisy and Reverberant Single-Channel Speech Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {696--700},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9053327},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-042}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Streaming Automatic Speech Recognition With The Transformer Model", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054476, April 2020, pp. 6074-6078.
    BibTeX TR2020-040 PDF Video
    • @inproceedings{Moritz2020apr,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Streaming Automatic Speech Recognition With The Transformer Model},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {6074--6078},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9054476},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-040}
    • }
  •  Sari, L., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054249, April 2020, pp. 7384-7388.
    BibTeX TR2020-037 PDF Video
    • @inproceedings{Sari2020apr,
    • author = {Sari, Leda and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {7384--7388},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9054249},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-037}
    • }
  •  Shi, L., Geng, S., Shuang, K., Hori, C., Liu, S., Gao, P., Su, S., "Multi-Layer Content Interaction Through Quaternion Product For Visual Question Answering", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9053595, April 2020, pp. 4412-4416.
    BibTeX TR2020-046 PDF
    • @inproceedings{Shi2020apr,
    • author = {Shi, Lei and Geng, Shijie and Shuang, Kai and Hori, Chiori and Liu, Songxiang and Gao, Peng and Su, Sen},
    • title = {Multi-Layer Content Interaction Through Quaternion Product For Visual Question Answering},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {4412--4416},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9053595},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-046}
    • }
  •  Li, R., Wang, X., Mallidi, H., Watanabe, S., Hori, T., Hermansky, H., "Multi-Stream End-to-End Speech Recognition", IEEE/ACM Transactions on Audio, Speech and Language Processing, DOI: 10.1109/TASLP.2019.2959721, Vol. 28, pp. 646-655, March 2020.
    BibTeX TR2020-030 PDF
    • @article{Li2020mar,
    • author = {Li, Ruizhi and Wang, Xiaofei and Mallidi, Harish and Watanabe, Shinji and Hori, Takaaki and Hermansky, Hynek},
    • title = {Multi-Stream End-to-End Speech Recognition},
    • journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
    • year = 2020,
    • volume = 28,
    • pages = {646--655},
    • month = mar,
    • doi = {10.1109/TASLP.2019.2959721},
    • url = {https://www.merl.com/publications/TR2020-030}
    • }
  •  D’Haro, L.F., Yoshino, K., Hori, C., Marks, T., Polymenakos, L., Kummerfeld, J.K., Galley, M., Gao, X., "Overview of the seventh Dialog System Technology Challenge: DSTC7", Computer Speech and Language, DOI: 10.1016/j.csl.2020.101068, Vol. 62, March 2020.
    BibTeX TR2020-029 PDF
    • @article{D’Haro2020mar,
    • author = {D’Haro, Luis, Fernando and Yoshino, Koichiro and Hori, Chiori and Marks, Tim and Polymenakos, Lazaros and Kummerfeld, Jonathan, k. and Galley, Michel and Gao, Xiang},
    • title = {Overview of the seventh Dialog System Technology Challenge: DSTC7},
    • journal = {Computer Speech and Language},
    • year = 2020,
    • volume = 62,
    • month = mar,
    • doi = {10.1016/j.csl.2020.101068},
    • url = {https://www.merl.com/publications/TR2020-029}
    • }
  •  Chang, X., Zhang, W., Qian, Y., Le Roux, J., Watanabe, S., "MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 237-144.
    BibTeX TR2019-157 PDF
    • @inproceedings{Chang2019dec,
    • author = {Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition},
    • booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    • year = 2019,
    • pages = {237--144},
    • month = dec,
    • isbn = {978-1-7281-0305-1},
    • url = {https://www.merl.com/publications/TR2019-157}
    • }
  •  Karita, S., Chen, N., Hayashi, T., Hori, T., Inaguma, H., Jiang, Z., Someki, M., Enrique Yalta Soplin, N., Yamamoto, R., Wang, X., Watanabe, S., Yoshimura, T., Zhang, W., "A Comparative Study on Transformer Vs RNN in Speech Applications", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU46091.2019.9003750, December 2019, pp. 449-456.
    BibTeX TR2019-158 PDF
    • @inproceedings{Karita2019dec,
    • author = {Karita, Shigeki and Chen, Nanxin and Hayashi, Tomoki and Hori, Takaaki and Inaguma, Hirofumi and Jiang, Ziyan and Someki, Masao and Enrique Yalta Soplin, Nelson and Yamamoto, Ryuichi and Wang, Xiaofei and Watanabe, Shinji and Yoshimura, Takenori and Zhang, Wangyou},
    • title = {A Comparative Study on Transformer Vs RNN in Speech Applications},
    • booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    • year = 2019,
    • pages = {449--456},
    • month = dec,
    • doi = {10.1109/ASRU46091.2019.9003750},
    • url = {https://www.merl.com/publications/TR2019-158}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Streaming End-to-End Speech Recognition with Joint CTC-Attention Based Models", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 936-943.
    BibTeX TR2019-159 PDF
    • @inproceedings{Moritz2019dec,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Streaming End-to-End Speech Recognition with Joint CTC-Attention Based Models},
    • booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    • year = 2019,
    • pages = {936--943},
    • month = dec,
    • isbn = {978-1-7281-0305-1},
    • url = {https://www.merl.com/publications/TR2019-159}
    • }
  •  Kavalerov, I., Wisdom, S., Erdogan, H., Patton, B., Wilson, K., Le Roux, J., Hershey, J., "Universal Sound Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937253, October 2019, pp. 170-174.
    BibTeX TR2019-123 PDF
    • @inproceedings{Kavalerov2019oct,
    • author = {Kavalerov, Ilya and Wisdom, Scott and Erdogan, Hakan and Patton, Brian and Wilson, Kevin and Le Roux, Jonathan and Hershey, John},
    • title = {Universal Sound Separation},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2019,
    • pages = {170--174},
    • month = oct,
    • doi = {10.1109/WASPAA.2019.8937253},
    • issn = {1947-1629},
    • isbn = {978-1-7281-1123-0},
    • url = {https://www.merl.com/publications/TR2019-123}
    • }
  •  Manilow, E., Wichern, G., Seetharaman, P., Le Roux, J., "Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937170, October 2019, pp. 45-49.
    BibTeX TR2019-124 PDF
    • @inproceedings{Manilow2019oct,
    • author = {Manilow, Ethan and Wichern, Gordon and Seetharaman, Prem and Le Roux, Jonathan},
    • title = {Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2019,
    • pages = {45--49},
    • month = oct,
    • doi = {10.1109/WASPAA.2019.8937170},
    • issn = {1947-1629},
    • isbn = {978-1-7281-1123-0},
    • url = {https://www.merl.com/publications/TR2019-124}
    • }
  •  Baskar, M.K., Watanabe, S., Astudillo, R., Hori, T., Burget, L., Cernocky, J.H., "Semi-supervised Sequence-to-sequence ASR using Unpaired Speech and Text", Interspeech, DOI: 10.21437/Interspeech.2019-3167, September 2019, pp. 3790-3794.
    BibTeX TR2019-100 PDF
    • @inproceedings{Baskar2019sep,
    • author = {Baskar, Murali Karthick and Watanabe, Shinji and Astudillo, Ramon and Hori, Takaaki and Burget, Lukas and Cernocky, Jan, Honza},
    • title = {Semi-supervised Sequence-to-sequence ASR using Unpaired Speech and Text},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {3790--3794},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-3167},
    • issn = {1990-9772},
    • url = {https://www.merl.com/publications/TR2019-100}
    • }
  •  Hori, C., Cherian, A., Marks, T., Hori, T., "Joint Student-Teacher Learning for Audio-Visual Scene-Aware Dialog", Interspeech, September 2019, pp. 1886-1890.
    BibTeX TR2019-097 PDF
    • @inproceedings{Hori2019sep,
    • author = {Hori, Chiori and Cherian, Anoop and Marks, Tim and Hori, Takaaki},
    • title = {Joint Student-Teacher Learning for Audio-Visual Scene-Aware Dialog},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {1886--1890},
    • month = sep,
    • publisher = {ISCA},
    • url = {https://www.merl.com/publications/TR2019-097}
    • }
  •  Karafiat, M., Baskar, M.K., Watanabe, S., Hori, T., Wiesner, M., Cernocky, J.H., "Analysis of Multilingual Sequence-to-Sequence Speech Recognition Systems", Interspeech, DOI: 10.21437/Interspeech.2019-2355//, September 2019, pp. 2019-2355.
    BibTeX TR2019-103 PDF
    • @inproceedings{Karafiat2019sep,
    • author = {Karafiat, Martin and Baskar, Murali Karthick and Watanabe, Shinji and Hori, Takaaki and Wiesner, Matthew and Cernocky, Jan, Honza},
    • title = {Analysis of Multilingual Sequence-to-Sequence Speech Recognition Systems},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {2019--2355},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2355//},
    • url = {https://www.merl.com/publications/TR2019-103}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2837, September 2019, pp. 76-80.
    BibTeX TR2019-098 PDF
    • @inproceedings{Moritz2019sep,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {76--80},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2837},
    • url = {https://www.merl.com/publications/TR2019-098}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
    BibTeX TR2019-101 PDF
    • @inproceedings{Seki2019sep,
    • author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
    • title = {End-to-End Multilingual Multi-Speaker Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {3755--3759},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-3038},
    • url = {https://www.merl.com/publications/TR2019-101}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Moritz, N., Le Roux, J., "Vectorized Beam Search for CTC-Attention-based Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2860, September 2019, pp. 3825-3829.
    BibTeX TR2019-102 PDF
    • @inproceedings{Seki2019sep2,
    • author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Moritz, Niko and Le Roux, Jonathan},
    • title = {Vectorized Beam Search for CTC-Attention-based Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {3825--3829},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2860},
    • url = {https://www.merl.com/publications/TR2019-102}
    • }
  •  Wichern, G., McQuinn, E., Antognini, J., Flynn, M., Zhu, R., Crow, D., Manilow, E., Le Roux, J., "WHAM!: Extending Speech Separation to Noisy Environments", Interspeech, DOI: 10.21437/Interspeech.2019-2821, September 2019, pp. 1368-1372.
    BibTeX TR2019-099 PDF
    • @inproceedings{Wichern2019sep,
    • author = {Wichern, Gordon and McQuinn, Emmett and Antognini, Joe and Flynn, Michael and Zhu, Richard and Crow, Dwight and Manilow, Ethan and Le Roux, Jonathan},
    • title = {WHAM!: Extending Speech Separation to Noisy Environments},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {1368--1372},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2821},
    • url = {https://www.merl.com/publications/TR2019-099}
    • }
  •  Yalta, N., Watanabe, S., Hori, T., Nakadai, K., Ogata, T., "CNN-based Multichannel End-to-End Speech Recognition for Everyday Home Environments", European Signal Processing Conference (EUSIPCO), DOI: 10.23919/EUSIPCO.2019.8902524, September 2019, pp. 1-5.
    BibTeX TR2019-094 PDF
    • @inproceedings{Yalta2019sep,
    • author = {Yalta, Nelson and Watanabe, Shinji and Hori, Takaaki and Nakadai, Kazuhiro and Ogata, Tetsuya},
    • title = {CNN-based Multichannel End-to-End Speech Recognition for Everyday Home Environments},
    • booktitle = {European Signal Processing Conference (EUSIPCO)},
    • year = 2019,
    • pages = {1--5},
    • month = sep,
    • doi = {10.23919/EUSIPCO.2019.8902524},
    • url = {https://www.merl.com/publications/TR2019-094}
    • }
  •  Alamri, H., Cartillier, V., Das, A., Wang, J., Lee, S., Anderson, P., Essa, I., Parikh, D., Batra, D., Cherian, A., Marks, T.K., Hori, C., "Audio-Visual Scene-Aware Dialog", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), DOI: 10.1109/CVPR.2019.00774, June 2019, pp. 7550-7559.
    BibTeX TR2019-048 PDF
    • @inproceedings{Alamri2019jun,
    • author = {Alamri, Huda and Cartillier, Vincent and Das, Abhishek and Wang, Jue and Lee, Stefan and Anderson, Peter and Essa, Irfan and Parikh, Devi and Batra, Dhruv and Cherian, Anoop and Marks, Tim K. and Hori, Chiori},
    • title = {Audio-Visual Scene-Aware Dialog},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2019,
    • pages = {7550--7559},
    • month = jun,
    • doi = {10.1109/CVPR.2019.00774},
    • url = {https://www.merl.com/publications/TR2019-048}
    • }
  •  Aihara, R., Hanazawa, T., Okato, Y., Wichern, G., Le Roux, J., "Teacher-Student Deep Clustering For Low-Delay Channel Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682695, May 2019.
    BibTeX TR2019-003 PDF
    • @inproceedings{Aihara2019may,
    • author = {Aihara, Ryo and Hanazawa, Toshiyuki and Okato, Yohei and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Teacher-Student Deep Clustering For Low-Delay Channel Speech Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682695},
    • url = {https://www.merl.com/publications/TR2019-003}
    • }