Publications

73 / 3,608 publications found.


  •  Kavalerov, I., Wisdom, S., Erdogan, H., Patton, B., Wilson, K., Le Roux, J., Hershey, J., "Universal Sound Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937253, October 2019, pp. 170-174.
    BibTeX TR2019-123 PDF
    • @inproceedings{Kavalerov2019oct,
    •   author    = {Kavalerov, Ilya and Wisdom, Scott and Erdogan, Hakan and Patton, Brian and Wilson, Kevin and Le Roux, Jonathan and Hershey, John},
    •   title     = {Universal Sound Separation},
    •   booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    •   pages     = {170--174},
    •   month     = oct,
    •   year      = {2019},
    •   doi       = {10.1109/WASPAA.2019.8937253},
    •   issn      = {1947-1629},
    •   isbn      = {978-1-7281-1123-0},
    •   url       = {https://www.merl.com/publications/TR2019-123}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
    BibTeX TR2019-101 PDF
    • @inproceedings{Seki2019sep,
    •   author    = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
    •   title     = {End-to-End Multilingual Multi-Speaker Speech Recognition},
    •   booktitle = {Interspeech},
    •   pages     = {3755--3759},
    •   month     = sep,
    •   year      = {2019},
    •   doi       = {10.21437/Interspeech.2019-3038},
    •   url       = {https://www.merl.com/publications/TR2019-101}
    • }
  •  Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
    BibTeX TR2019-008 PDF
    • @inproceedings{LeRoux2019may2,
    •   author    = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
    •   title     = {The Phasebook: Building Complex Masks via Discrete Representations for Source Separation},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   month     = may,
    •   year      = {2019},
    •   doi       = {10.1109/ICASSP.2019.8682587},
    •   url       = {https://www.merl.com/publications/TR2019-008}
    • }
  •  Le Roux, J., Wisdom, S., Erdogan, H., Hershey, J., "SDR -- Half-Baked or Well Done?", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683855, May 2019.
    BibTeX TR2019-013 PDF
    • @inproceedings{LeRoux2019may,
    • author = {Le Roux, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John},
    • title = {{SDR} -- Half-Baked or Well Done?},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683855},
    • url = {https://www.merl.com/publications/TR2019-013}
    • }
  •  Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "Phasebook and Friends: Leveraging discrete representations for source separation", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2019.2904183, Vol. 13, No. 2, pp. 370-382, March 2019.
    BibTeX TR2018-199 PDF
    • @article{LeRoux2019mar,
    •   author  = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
    •   title   = {Phasebook and Friends: Leveraging discrete representations for source separation},
    •   journal = {IEEE Journal of Selected Topics in Signal Processing},
    •   volume  = {13},
    •   number  = {2},
    •   pages   = {370--382},
    •   month   = mar,
    •   year    = {2019},
    •   doi     = {10.1109/JSTSP.2019.2904183},
    •   url     = {https://www.merl.com/publications/TR2018-199}
    • }
  •  Hori, T., Wang, W., Koji, Y., Hori, C., Harsham, B.A., Hershey, J., "Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models", Computer Speech and Language, DOI: 10.1016/j.csl.2018.08.006, Vol. 54, pp. 122-139, December 2018.
    BibTeX TR2018-161 PDF
    • @article{Hori2018dec2,
    •   author    = {Hori, Takaaki and Wang, Wen and Koji, Yusuke and Hori, Chiori and Harsham, Bret A. and Hershey, John},
    •   title     = {Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models},
    •   journal   = {Computer Speech and Language},
    •   volume    = {54},
    •   pages     = {122--139},
    •   month     = dec,
    •   year      = {2018},
    •   publisher = {Elsevier},
    •   doi       = {10.1016/j.csl.2018.08.006},
    •   url       = {https://www.merl.com/publications/TR2018-161}
    • }
  •  Wang, Z.-Q., Le Roux, J., Wang, D., Hershey, J., "End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction", Interspeech, September 2018.
    BibTeX TR2018-135 PDF
    • @inproceedings{Wang2018sep,
    •   author    = {Wang, Zhong-Qiu and Le Roux, Jonathan and Wang, DeLiang and Hershey, John},
    •   title     = {End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction},
    •   booktitle = {Interspeech},
    •   month     = sep,
    •   year      = {2018},
    •   url       = {https://www.merl.com/publications/TR2018-135}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "A Purely End-to-end System for Multi-speaker Speech Recognition", Annual Meeting of the Association for Computational Linguistics (ACL), July 2018, pp. 2620-2630.
    BibTeX TR2018-104 PDF Video
    • @inproceedings{Seki2018jul,
    • author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
    • title = {A Purely End-to-end System for Multi-speaker Speech Recognition},
    • booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)},
    • year = 2018,
    • pages = {2620--2630},
    • month = jul,
    • publisher = {Association for Computational Linguistics},
    • url = {https://www.merl.com/publications/TR2018-104}
    • }
  •  Erdogan, H., Hershey, J., Watanabe, S., Le Roux, J., "Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., Hershey, J.R., Eds., chapter 7, Springer, July 2018.
    BibTeX
    • @incollection{Erdogan2018jul,
    • author = {Erdogan, Hakan and Hershey, John and Watanabe, Shinji and Le Roux, Jonathan},
    • title = {Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio},
    • booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
    • year = 2018,
    • editor = {Watanabe, Shinji and Delcroix, Marc and Metze, Florian and Hershey, John R.},
    • chapter = 7,
    • month = jul,
    • publisher = {Springer},
    • isbn = {978-3-319-64680-0}
    • }
  •  Hershey, J., Le Roux, J., Watanabe, S., Wisdom, S., Chen, Z., Isik, Y., "Novel deep architectures in speech processing" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., Hershey, J.R., Eds., chapter 6, Springer, July 9, 2018.
    BibTeX
    • @incollection{Hershey2018jul,
    • author = {Hershey, John and Le Roux, Jonathan and Watanabe, Shinji and Wisdom, Scott and Chen, Zhuo and Isik, Yusuf},
    • title = {Novel deep architectures in speech processing},
    • booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
    • year = 2018,
    • editor = {Watanabe, Shinji and Delcroix, Marc and Metze, Florian and Hershey, John R.},
    • chapter = 6,
    • month = jul,
    • publisher = {Springer},
    • isbn = {978-3-319-64680-0}
    • }
  •  Watanabe, S., Hori, T., Miao, Y., Delcroix, M., Metze, F., Hershey, J., "Toolkits for robust speech processing" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., Hershey, J.R., Eds., chapter 14, Springer, July 9, 2018.
    BibTeX
    • @incollection{Watanabe2018jul,
    • author = {Watanabe, Shinji and Hori, Takaaki and Miao, Yajie and Delcroix, Marc and Metze, Florian and Hershey, John},
    • title = {Toolkits for robust speech processing},
    • booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
    • year = 2018,
    • editor = {Watanabe, Shinji and Delcroix, Marc and Metze, Florian and Hershey, John R.},
    • chapter = 14,
    • month = jul,
    • publisher = {Springer},
    • isbn = {978-3-319-64680-0}
    • }
  •  Xiao, X., Watanabe, S., Erdogan, H., Mandel, M., Lu, L., Hershey, J., Seltzer, M., Chen, G., Zhang, Y., Yu, D., "Discriminative beamforming with phase aware neural networks for speech enhancement and recognition" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., Hershey, J.R., Eds., chapter 4, Springer, July 9, 2018.
    BibTeX
    • @incollection{Xiao2018jul2,
    • author = {Xiao, Xiong and Watanabe, Shinji and Erdogan, Hakan and Mandel, Michael and Lu, Liang and Hershey, John and Seltzer, Mike and Chen, Guoguo and Zhang, Yu and Yu, Dong},
    • title = {Discriminative beamforming with phase aware neural networks for speech enhancement and recognition},
    • booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
    • year = 2018,
    • editor = {Watanabe, Shinji and Delcroix, Marc and Metze, Florian and Hershey, John R.},
    • chapter = 4,
    • month = jul,
    • publisher = {Springer},
    • isbn = {978-3-319-64680-0}
    • }
  •  Ochiai, T., Watanabe, S., Katagiri, S., Hori, T., Hershey, J.R., "Speaker Adaptation for Multichannel End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462161, April 2018, pp. 6707-6711.
    BibTeX TR2018-006 PDF
    • @inproceedings{Ochiai2018apr,
    •   author    = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru and Hori, Takaaki and Hershey, John R.},
    •   title     = {Speaker Adaptation for Multichannel End-to-End Speech Recognition},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   pages     = {6707--6711},
    •   month     = apr,
    •   year      = {2018},
    •   doi       = {10.1109/ICASSP.2018.8462161},
    •   url       = {https://www.merl.com/publications/TR2018-006}
    • }
  •  Seki, H., Watanabe, S., Hori, T., Le Roux, J., Hershey, J.R., "An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462180, April 2018, pp. 4919-4923.
    BibTeX TR2018-002 PDF Video
    • @inproceedings{Seki2018apr,
    •   author    = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and Le Roux, Jonathan and Hershey, John R.},
    •   title     = {An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   pages     = {4919--4923},
    •   month     = apr,
    •   year      = {2018},
    •   doi       = {10.1109/ICASSP.2018.8462180},
    •   url       = {https://www.merl.com/publications/TR2018-002}
    • }
  •  Settle, S., Le Roux, J., Hori, T., Watanabe, S., Hershey, J.R., "End-to-End Multi-Speaker Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461893, April 2018, pp. 4819-4823.
    BibTeX TR2018-001 PDF Video
    • @inproceedings{Settle2018apr,
    •   author    = {Settle, Shane and Le Roux, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
    •   title     = {End-to-End Multi-Speaker Speech Recognition},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   pages     = {4819--4823},
    •   month     = apr,
    •   year      = {2018},
    •   doi       = {10.1109/ICASSP.2018.8461893},
    •   url       = {https://www.merl.com/publications/TR2018-001}
    • }
  •  Wang, Z.-Q., Le Roux, J., Hershey, J.R., "Multi-Channel Deep Clustering: Discriminative Spectral and Spatial Embeddings for Speaker-Independent Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461639, April 2018, pp. 1-5.
    BibTeX TR2018-007 PDF
    • @inproceedings{Wang2018apr2,
    •   author    = {Wang, Zhong-Qiu and Le Roux, Jonathan and Hershey, John R.},
    •   title     = {Multi-Channel Deep Clustering: Discriminative Spectral and Spatial Embeddings for Speaker-Independent Speech Separation},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   pages     = {1--5},
    •   month     = apr,
    •   year      = {2018},
    •   doi       = {10.1109/ICASSP.2018.8461639},
    •   url       = {https://www.merl.com/publications/TR2018-007}
    • }
  •  Wang, Z.-Q., Le Roux, J., Hershey, J.R., "Alternative Objective Functions for Deep Clustering", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462507, April 2018, pp. 686-690.
    BibTeX TR2018-005 PDF
    • @inproceedings{Wang2018apr,
    •   author    = {Wang, Zhong-Qiu and Le Roux, Jonathan and Hershey, John R.},
    •   title     = {Alternative Objective Functions for Deep Clustering},
    •   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    •   pages     = {686--690},
    •   month     = apr,
    •   year      = {2018},
    •   doi       = {10.1109/ICASSP.2018.8462507},
    •   url       = {https://www.merl.com/publications/TR2018-005}
    • }
  •  Hori, C., Hori, T., Marks, T.K., Hershey, J.R., "Early and Late Integration of Audio Features for Automatic Video Description", IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), DOI: 10.1109/ASRU.2017.8268968, December 2017.
    BibTeX TR2017-183 PDF
    • @inproceedings{Hori2017dec2,
    •   author    = {Hori, Chiori and Hori, Takaaki and Marks, Tim K. and Hershey, John R.},
    •   title     = {Early and Late Integration of Audio Features for Automatic Video Description},
    •   booktitle = {IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
    •   month     = dec,
    •   year      = {2017},
    •   doi       = {10.1109/ASRU.2017.8268968},
    •   url       = {https://www.merl.com/publications/TR2017-183}
    • }
  •  Hori, T., Watanabe, S., Hershey, J.R., "Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268948, December 2017.
    BibTeX TR2017-181 PDF
    • @inproceedings{Hori2017dec,
    •   author    = {Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
    •   title     = {Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition},
    •   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    •   month     = dec,
    •   year      = {2017},
    •   doi       = {10.1109/ASRU.2017.8268948},
    •   url       = {https://www.merl.com/publications/TR2017-181}
    • }
  •  Watanabe, S., Hori, T., Hershey, J.R., "Language Independent End-to-End Architecture For Joint Language and Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268945, December 2017.
    BibTeX TR2017-182 PDF Video
    • @inproceedings{Watanabe2017dec,
    •   author    = {Watanabe, Shinji and Hori, Takaaki and Hershey, John R.},
    •   title     = {Language Independent End-to-End Architecture For Joint Language and Speech Recognition},
    •   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    •   month     = dec,
    •   year      = {2017},
    •   doi       = {10.1109/ASRU.2017.8268945},
    •   url       = {https://www.merl.com/publications/TR2017-182}
    • }
  •  Wang, W., Koji, Y., Harsham, B.A., Hori, T., Hershey, J.R., "Sequence Adversarial Training and Minimum Bayes Risk Decoding for End-to-end Neural Conversation Models", Dialog System Technology Challenges, December 2017.
    BibTeX TR2017-180 PDF
    • @inproceedings{Wang2017dec,
    • author = {Wang, Wen and Koji, Yusuke and Harsham, Bret A. and Hori, Takaaki and Hershey, John R.},
    • title = {Sequence Adversarial Training and Minimum {Bayes} Risk Decoding for End-to-end Neural Conversation Models},
    • booktitle = {Dialog System Technology Challenges},
    • year = 2017,
    • month = dec,
    • url = {https://www.merl.com/publications/TR2017-180}
    • }
  •  Hori, C., Hori, T., Lee, T.-Y., Zhang, Z., Harsham, B.A., Sumi, K., Marks, T.K., Hershey, J.R., "Attention-Based Multimodal Fusion for Video Description", IEEE International Conference on Computer Vision (ICCV), DOI: 10.1109/ICCV.2017.450, October 2017.
    BibTeX TR2017-156 PDF
    • @inproceedings{Hori2017oct,
    •   author    = {Hori, Chiori and Hori, Takaaki and Lee, Teng-Yok and Zhang, Ziming and Harsham, Bret A. and Sumi, Kazuhiko and Marks, Tim K. and Hershey, John R.},
    •   title     = {Attention-Based Multimodal Fusion for Video Description},
    •   booktitle = {IEEE International Conference on Computer Vision (ICCV)},
    •   month     = oct,
    •   year      = {2017},
    •   doi       = {10.1109/ICCV.2017.450},
    •   url       = {https://www.merl.com/publications/TR2017-156}
    • }
  •  Ochiai, T., Watanabe, S., Hori, T., Hershey, J.R., Xiao, X., "Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2764276, Vol. 11, No. 8, pp. 1274-1288, October 2017.
    BibTeX TR2017-192 PDF
    • @article{Ochiai2017oct2,
    •   author  = {Ochiai, Tsubasa and Watanabe, Shinji and Hori, Takaaki and Hershey, John R. and Xiao, Xiong},
    •   title   = {Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming},
    •   journal = {IEEE Journal of Selected Topics in Signal Processing},
    •   volume  = {11},
    •   number  = {8},
    •   pages   = {1274--1288},
    •   month   = oct,
    •   year    = {2017},
    •   doi     = {10.1109/JSTSP.2017.2764276},
    •   issn    = {1941-0484},
    •   url     = {https://www.merl.com/publications/TR2017-192}
    • }
  •  Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T., "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2763455, Vol. 11, No. 8, pp. 1240-1253, October 2017.
    BibTeX TR2017-190 PDF Video
    • @article{Watanabe2017oct,
    • author = {Watanabe, Shinji and Hori, Takaaki and Kim, Suyoun and Hershey, John R. and Hayashi, Tomoki},
    • title = {Hybrid {CTC}/Attention Architecture for End-to-End Speech Recognition},
    • journal = {IEEE Journal of Selected Topics in Signal Processing},
    • year = 2017,
    • volume = 11,
    • number = 8,
    • pages = {1240--1253},
    • month = oct,
    • doi = {10.1109/JSTSP.2017.2763455},
    • issn = {1941-0484},
    • url = {https://www.merl.com/publications/TR2017-190}
    • }
  •  Ochiai, T., Watanabe, S., Hori, T., Hershey, J.R., "Multichannel End-to-end Speech Recognition", International Conference on Machine Learning (ICML), August 2017.
    BibTeX TR2017-107 PDF
    • @inproceedings{Ochiai2017aug,
    •   author    = {Ochiai, Tsubasa and Watanabe, Shinji and Hori, Takaaki and Hershey, John R.},
    •   title     = {Multichannel End-to-end Speech Recognition},
    •   booktitle = {International Conference on Machine Learning (ICML)},
    •   month     = aug,
    •   year      = {2017},
    •   url       = {https://www.merl.com/publications/TR2017-107}
    • }