Publications

Cho, J., Watanabe, S., Hori, T., Baskar, M.K., Inaguma, H., Villalba, J., Dehak, N., "Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683380, May 2019.
BibTeX TR2019-007 PDF
- @inproceedings{Cho2019may,
-   author    = {Cho, Jaejin and Watanabe, Shinji and Hori, Takaaki and Baskar, Murali Karthick and Inaguma, Hirofumi and Villalba, Jesus and Dehak, Najim},
-   title     = {{Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2019},
-   doi       = {10.1109/ICASSP.2019.8683380},
-   url       = {https://www.merl.com/publications/TR2019-007}
- }
Hori, C., Alamri, H., Wang, J., Wichern, G., Hori, T., Cherian, A., Marks, T.K., Cartillier, V., Lopes, R., Das, A., Essa, I., Batra, D., Parikh, D., "End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682583, May 2019.
BibTeX TR2019-016 PDF
- @inproceedings{Hori2019may2,
-   author    = {Hori, Chiori and Alamri, Huda and Wang, Jue and Wichern, Gordon and Hori, Takaaki and Cherian, Anoop and Marks, Tim K. and Cartillier, Vincent and Lopes, Raphael and Das, Abhishek and Essa, Irfan and Batra, Dhruv and Parikh, Devi},
-   title     = {{End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2019},
-   doi       = {10.1109/ICASSP.2019.8682583},
-   url       = {https://www.merl.com/publications/TR2019-016}
- }
Hori, T., Astudillo, R., Hayashi, T., Zhang, Y., Watanabe, S., Le Roux, J., "Cycle-Consistency Training for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683307, May 2019.
BibTeX TR2019-002 PDF
- @inproceedings{Hori2019may,
- author = {Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Cycle-Consistency Training for End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683307},
- url = {https://www.merl.com/publications/TR2019-002}
- }
Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
BibTeX TR2019-015 PDF
- @inproceedings{Moritz2019may,
-   author    = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
-   title     = {{Triggered Attention for End-to-End Speech Recognition}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2019},
-   doi       = {10.1109/ICASSP.2019.8683510},
-   url       = {https://www.merl.com/publications/TR2019-015}
- }
Wang, X., Li, R., Mallidi, S.H., Hori, T., Watanabe, S., Hermansky, H., "Stream Attention-Based Multi-Array End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682650, May 2019.
BibTeX TR2019-005 PDF
- @inproceedings{Wang2019may,
-   author    = {Wang, Xiaofei and Li, Ruizhi and Mallidi, Sri Harish and Hori, Takaaki and Watanabe, Shinji and Hermansky, Hynek},
-   title     = {{Stream Attention-Based Multi-Array End-to-End Speech Recognition}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2019},
-   doi       = {10.1109/ICASSP.2019.8682650},
-   url       = {https://www.merl.com/publications/TR2019-005}
- }
Cho, J., Baskar, M.K., Li, R., Wiesner, M., Mallidi, S.H., Yalta, N., Karafiat, M., Watanabe, S., Hori, T., "Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639655, December 2018.
BibTeX TR2018-175 PDF
- @inproceedings{Cho2018dec,
-   author    = {Cho, Jaejin and Baskar, Murali Karthick and Li, Ruizhi and Wiesner, Matthew and Mallidi, Sri Harish and Yalta, Nelson and Karafiat, Martin and Watanabe, Shinji and Hori, Takaaki},
-   title     = {{Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling}},
-   booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
-   month     = dec,
-   year      = {2018},
-   doi       = {10.1109/SLT.2018.8639655},
-   url       = {https://www.merl.com/publications/TR2018-175}
- }
Hayashi, T., Watanabe, S., Zhang, Y., Toda, T., Hori, T., Astudillo, R., Takeda, K., "Back-Translation-Style Data Augmentation for End-to-End ASR", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639619, December 2018.
BibTeX TR2018-174 PDF
- @inproceedings{Hayashi2018dec,
-   author    = {Hayashi, Tomoki and Watanabe, Shinji and Zhang, Yu and Toda, Tomoki and Hori, Takaaki and Astudillo, Ramon and Takeda, Kazuya},
-   title     = {{Back-Translation-Style Data Augmentation for End-to-End ASR}},
-   booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
-   month     = dec,
-   year      = {2018},
-   doi       = {10.1109/SLT.2018.8639619},
-   url       = {https://www.merl.com/publications/TR2018-174}
- }
Hori, T., Cho, J., Watanabe, S., "End-to-End Speech Recognition with Word-Based RNN Language Models", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639693, December 2018.
BibTeX TR2018-176 PDF
- @inproceedings{Hori2018dec,
-   author    = {Hori, Takaaki and Cho, Jaejin and Watanabe, Shinji},
-   title     = {{End-to-End Speech Recognition with Word-Based RNN Language Models}},
-   booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
-   month     = dec,
-   year      = {2018},
-   doi       = {10.1109/SLT.2018.8639693},
-   url       = {https://www.merl.com/publications/TR2018-176}
- }
Hori, T., Wang, W., Koji, Y., Hori, C., Harsham, B.A., Hershey, J., "Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models", Computer Speech and Language, DOI: 10.1016/j.csl.2018.08.006, Vol. 54, pp. 122-139, December 2018.
BibTeX TR2018-161 PDF
- @article{Hori2018dec2,
-   author    = {Hori, Takaaki and Wang, Wen and Koji, Yusuke and Hori, Chiori and Harsham, Bret A. and Hershey, John},
-   title     = {{Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models}},
-   journal   = {Computer Speech and Language},
-   volume    = {54},
-   pages     = {122--139},
-   month     = dec,
-   year      = {2018},
-   publisher = {Elsevier},
-   doi       = {10.1016/j.csl.2018.08.006},
-   url       = {https://www.merl.com/publications/TR2018-161}
- }
Watanabe, S., Hori, T., Karita, S., Hayashi, T., Nishitoba, J., Unno, Y., Yalta Soplin, N.E., Heymann, J., Wiesner, M., Chen, N., Renduchintala, A., Ochiai, T., "ESPnet: End-to-End Speech Processing Toolkit", Interspeech, September 2018.
BibTeX TR2018-136 PDF
- @inproceedings{Watanabe2018sep,
- author = {Watanabe, Shinji and Hori, Takaaki and Karita, Shigeki and Hayashi, Tomoki and Nishitoba, Jiro and Unno, Yuya and Yalta Soplin, Nelson Enrique and Heymann, Jahn and Wiesner, Matthew and Chen, Nanxin and Renduchintala, Adithya and Ochiai, Tsubasa},
- title = {{ESPnet: End-to-End Speech Processing Toolkit}},
- booktitle = {Interspeech},
- year = 2018,
- month = sep,
- url = {https://www.merl.com/publications/TR2018-136}
- }
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "A Purely End-to-end System for Multi-speaker Speech Recognition", Annual Meeting of the Association for Computational Linguistics (ACL), July 2018, pp. 2620-2630.
BibTeX TR2018-104 PDF Video
- @inproceedings{Seki2018jul,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan and Hershey, John},
- title = {{A Purely End-to-end System for Multi-speaker Speech Recognition}},
- booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)},
- year = 2018,
- pages = {2620--2630},
- month = jul,
- publisher = {Association for Computational Linguistics},
- url = {https://www.merl.com/publications/TR2018-104}
- }
Watanabe, S., Hori, T., Miao, Y., Delcroix, M., Metze, F., Hershey, J., "Toolkits for robust speech processing" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., Hershey, J.R., Eds., chapter 14, Springer, July 9, 2018.
BibTeX
- @incollection{Watanabe2018jul,
- author = {Watanabe, Shinji and Hori, Takaaki and Miao, Yajie and Delcroix, Marc and Metze, Florian and Hershey, John},
- title = {{Toolkits for robust speech processing}},
- booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
- year = 2018,
- editor = {Watanabe, Shinji and Delcroix, Marc and Metze, Florian and Hershey, John R.},
- chapter = 14,
- month = jul,
- publisher = {Springer}
- }
Ochiai, T., Watanabe, S., Katagiri, S., Hori, T., Hershey, J.R., "Speaker Adaptation for Multichannel End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462161, April 2018, pp. 6707-6711.
BibTeX TR2018-006 PDF
- @inproceedings{Ochiai2018apr,
-   author    = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru and Hori, Takaaki and Hershey, John R.},
-   title     = {{Speaker Adaptation for Multichannel End-to-End Speech Recognition}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   pages     = {6707--6711},
-   month     = apr,
-   year      = {2018},
-   doi       = {10.1109/ICASSP.2018.8462161},
-   url       = {https://www.merl.com/publications/TR2018-006}
- }
Seki, H., Watanabe, S., Hori, T., Le Roux, J., Hershey, J.R., "An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462180, April 2018, pp. 4919-4923.
BibTeX TR2018-002 PDF Video
- @inproceedings{Seki2018apr,
-   author    = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and {Le Roux}, Jonathan and Hershey, John R.},
-   title     = {{An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   pages     = {4919--4923},
-   month     = apr,
-   year      = {2018},
-   doi       = {10.1109/ICASSP.2018.8462180},
-   url       = {https://www.merl.com/publications/TR2018-002}
- }
Settle, S., Le Roux, J., Hori, T., Watanabe, S., Hershey, J.R., "End-to-End Multi-Speaker Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461893, April 2018, pp. 4819-4823.
BibTeX TR2018-001 PDF Video
- @inproceedings{Settle2018apr,
-   author    = {Settle, Shane and {Le Roux}, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
-   title     = {{End-to-End Multi-Speaker Speech Recognition}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   pages     = {4819--4823},
-   month     = apr,
-   year      = {2018},
-   doi       = {10.1109/ICASSP.2018.8461893},
-   url       = {https://www.merl.com/publications/TR2018-001}
- }
Hori, C., Hori, T., "End-to-end Conversation Modeling Track in DSTC6", Dialog System Technology Challenges, December 2017.
BibTeX TR2017-188 PDF
- @inproceedings{Hori2017dec3,
-   author    = {Hori, Chiori and Hori, Takaaki},
-   title     = {{End-to-end Conversation Modeling Track in DSTC6}},
-   booktitle = {Dialog System Technology Challenges},
-   month     = dec,
-   year      = {2017},
-   url       = {https://www.merl.com/publications/TR2017-188}
- }
Hori, C., Hori, T., Marks, T.K., Hershey, J.R., "Early and Late Integration of Audio Features for Automatic Video Description", IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), DOI: 10.1109/ASRU.2017.8268968, December 2017.
BibTeX TR2017-183 PDF
- @inproceedings{Hori2017dec2,
-   author    = {Hori, Chiori and Hori, Takaaki and Marks, Tim K. and Hershey, John R.},
-   title     = {{Early and Late Integration of Audio Features for Automatic Video Description}},
-   booktitle = {IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
-   month     = dec,
-   year      = {2017},
-   doi       = {10.1109/ASRU.2017.8268968},
-   url       = {https://www.merl.com/publications/TR2017-183}
- }
Hori, T., Watanabe, S., Hershey, J.R., "Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268948, December 2017.
BibTeX TR2017-181 PDF
- @inproceedings{Hori2017dec,
-   author    = {Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
-   title     = {{Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition}},
-   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
-   month     = dec,
-   year      = {2017},
-   doi       = {10.1109/ASRU.2017.8268948},
-   url       = {https://www.merl.com/publications/TR2017-181}
- }
Watanabe, S., Hori, T., Hershey, J.R., "Language Independent End-to-End Architecture For Joint Language and Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268945, December 2017.
BibTeX TR2017-182 PDF Video
- @inproceedings{Watanabe2017dec,
-   author    = {Watanabe, Shinji and Hori, Takaaki and Hershey, John R.},
-   title     = {{Language Independent End-to-End Architecture For Joint Language and Speech Recognition}},
-   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
-   month     = dec,
-   year      = {2017},
-   doi       = {10.1109/ASRU.2017.8268945},
-   url       = {https://www.merl.com/publications/TR2017-182}
- }
Wang, W., Koji, Y., Harsham, B.A., Hori, T., Hershey, J.R., "Sequence Adversarial Training and Minimum Bayes Risk Decoding for End-to-end Neural Conversation Models", Dialog System Technology Challenges, December 2017.
BibTeX TR2017-180 PDF
- @inproceedings{Wang2017dec,
-   author    = {Wang, Wen and Koji, Yusuke and Harsham, Bret A. and Hori, Takaaki and Hershey, John R.},
-   title     = {{Sequence Adversarial Training and Minimum Bayes Risk Decoding for End-to-end Neural Conversation Models}},
-   booktitle = {Dialog System Technology Challenges},
-   month     = dec,
-   year      = {2017},
-   url       = {https://www.merl.com/publications/TR2017-180}
- }
Hori, C., Hori, T., Lee, T.-Y., Zhang, Z., Harsham, B.A., Sumi, K., Marks, T.K., Hershey, J.R., "Attention-Based Multimodal Fusion for Video Description", IEEE International Conference on Computer Vision (ICCV), DOI: 10.1109/ICCV.2017.450, October 2017.
BibTeX TR2017-156 PDF
- @inproceedings{Hori2017oct,
-   author    = {Hori, Chiori and Hori, Takaaki and Lee, Teng-Yok and Zhang, Ziming and Harsham, Bret A. and Sumi, Kazuhiko and Marks, Tim K. and Hershey, John R.},
-   title     = {{Attention-Based Multimodal Fusion for Video Description}},
-   booktitle = {IEEE International Conference on Computer Vision (ICCV)},
-   month     = oct,
-   year      = {2017},
-   doi       = {10.1109/ICCV.2017.450},
-   url       = {https://www.merl.com/publications/TR2017-156}
- }
Ochiai, T., Watanabe, S., Hori, T., Hershey, J.R., Xiao, X., "Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2764276, Vol. 11, No. 8, pp. 1274-1288, October 2017.
BibTeX TR2017-192 PDF
- @article{Ochiai2017oct2,
-   author  = {Ochiai, Tsubasa and Watanabe, Shinji and Hori, Takaaki and Hershey, John R. and Xiao, Xiong},
-   title   = {{Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming}},
-   journal = {IEEE Journal of Selected Topics in Signal Processing},
-   volume  = {11},
-   number  = {8},
-   pages   = {1274--1288},
-   month   = oct,
-   year    = {2017},
-   doi     = {10.1109/JSTSP.2017.2764276},
-   issn    = {1941-0484},
-   url     = {https://www.merl.com/publications/TR2017-192}
- }
Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T., "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2763455, Vol. 11, No. 8, pp. 1240-1253, October 2017.
BibTeX TR2017-190 PDF Video
- @article{Watanabe2017oct,
-   author  = {Watanabe, Shinji and Hori, Takaaki and Kim, Suyoun and Hershey, John R. and Hayashi, Tomoki},
-   title   = {{Hybrid CTC/Attention Architecture for End-to-End Speech Recognition}},
-   journal = {IEEE Journal of Selected Topics in Signal Processing},
-   volume  = {11},
-   number  = {8},
-   pages   = {1240--1253},
-   month   = oct,
-   year    = {2017},
-   doi     = {10.1109/JSTSP.2017.2763455},
-   issn    = {1941-0484},
-   url     = {https://www.merl.com/publications/TR2017-190}
- }
Hori, T., Watanabe, S., Zhang, Y., Chan, W., "Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM", Interspeech, August 2017.
BibTeX TR2017-132 PDF Video
- @inproceedings{Hori2017aug,
-   author    = {Hori, Takaaki and Watanabe, Shinji and Zhang, Yu and Chan, William},
-   title     = {{Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM}},
-   booktitle = {Interspeech},
-   month     = aug,
-   year      = {2017},
-   url       = {https://www.merl.com/publications/TR2017-132}
- }
Hayashi, T., Watanabe, S., Toda, T., Hori, T., Le Roux, J., Takeda, K., "Duration-Controlled LSTM for Polyphonic Sound Event Detection", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2017.2740002, Vol. 25, No. 11, August 2017.
BibTeX TR2017-150 PDF
- @article{Hayashi2017aug,
-   author  = {Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki and Hori, Takaaki and {Le Roux}, Jonathan and Takeda, Kazuya},
-   title   = {{Duration-Controlled LSTM for Polyphonic Sound Event Detection}},
-   journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
-   volume  = {25},
-   number  = {11},
-   month   = aug,
-   year    = {2017},
-   doi     = {10.1109/TASLP.2017.2740002},
-   issn    = {2329-9304},
-   url     = {https://www.merl.com/publications/TR2017-150}
- }