Publications

Aihara, R., Wichern, G., Le Roux, J., "Deep Clustering-based Single Channel Speech Separation and Recent Advances", Acoustical Science and Technology, DOI: 10.1250/ast.41.465, Vol. 41, No. 2, pp. 465-471, March 2020.
BibTeX TR2021-020 PDF
@article{Aihara2020jun,
  author  = {Aihara, Ryo and Wichern, Gordon and Le Roux, Jonathan},
  title   = {Deep Clustering-based Single Channel Speech Separation and Recent Advances},
  journal = {Acoustical Science and Technology},
  year    = 2020,
  volume  = 41,
  number  = 2,
  pages   = {465--471},
  month   = mar,
  doi     = {10.1250/ast.41.465},
  url     = {https://www.merl.com/publications/TR2021-020}
}
Chang, X., Zhang, W., Qian, Y., Le Roux, J., Watanabe, S., "MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 237-244.
BibTeX TR2019-157 PDF
@inproceedings{Chang2019dec,
  author    = {Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and Le Roux, Jonathan and Watanabe, Shinji},
  title     = {{MIMO-Speech}: End-to-End Multi-Channel Multi-Speaker Speech Recognition},
  booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  year      = 2019,
  pages     = {237--244},
  month     = dec,
  isbn      = {978-1-7281-0305-1},
  url       = {https://www.merl.com/publications/TR2019-157}
}
Moritz, N., Hori, T., Le Roux, J., "Streaming End-to-End Speech Recognition with Joint CTC-Attention Based Models", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 936-943.
BibTeX TR2019-159 PDF
@inproceedings{Moritz2019dec,
  author    = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title     = {Streaming End-to-End Speech Recognition with Joint {CTC}-Attention Based Models},
  booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  year      = 2019,
  pages     = {936--943},
  month     = dec,
  isbn      = {978-1-7281-0305-1},
  url       = {https://www.merl.com/publications/TR2019-159}
}
Kavalerov, I., Wisdom, S., Erdogan, H., Patton, B., Wilson, K., Le Roux, J., Hershey, J., "Universal Sound Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937253, October 2019, pp. 170-174.
BibTeX TR2019-123 PDF
@inproceedings{Kavalerov2019oct,
  author    = {Kavalerov, Ilya and Wisdom, Scott and Erdogan, Hakan and Patton, Brian and Wilson, Kevin and Le Roux, Jonathan and Hershey, John},
  title     = {Universal Sound Separation},
  booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
  year      = 2019,
  pages     = {170--174},
  month     = oct,
  doi       = {10.1109/WASPAA.2019.8937253},
  issn      = {1947-1629},
  isbn      = {978-1-7281-1123-0},
  url       = {https://www.merl.com/publications/TR2019-123}
}
Manilow, E., Wichern, G., Seetharaman, P., Le Roux, J., "Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937170, October 2019, pp. 45-49.
BibTeX TR2019-124 PDF
@inproceedings{Manilow2019oct,
  author    = {Manilow, Ethan and Wichern, Gordon and Seetharaman, Prem and Le Roux, Jonathan},
  title     = {Cutting Music Source Separation Some {Slakh}: A Dataset to Study the Impact of Training Data Quality and Quantity},
  booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
  year      = 2019,
  pages     = {45--49},
  month     = oct,
  doi       = {10.1109/WASPAA.2019.8937170},
  issn      = {1947-1629},
  isbn      = {978-1-7281-1123-0},
  url       = {https://www.merl.com/publications/TR2019-124}
}
Moritz, N., Hori, T., Le Roux, J., "Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2837, September 2019, pp. 76-80.
BibTeX TR2019-098 PDF
@inproceedings{Moritz2019sep,
  author    = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title     = {Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition},
  booktitle = {Interspeech},
  year      = 2019,
  pages     = {76--80},
  month     = sep,
  doi       = {10.21437/Interspeech.2019-2837},
  url       = {https://www.merl.com/publications/TR2019-098}
}
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
BibTeX TR2019-101 PDF
@inproceedings{Seki2019sep,
  author    = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
  title     = {End-to-End Multilingual Multi-Speaker Speech Recognition},
  booktitle = {Interspeech},
  year      = 2019,
  pages     = {3755--3759},
  month     = sep,
  doi       = {10.21437/Interspeech.2019-3038},
  url       = {https://www.merl.com/publications/TR2019-101}
}
Seki, H., Hori, T., Watanabe, S., Moritz, N., Le Roux, J., "Vectorized Beam Search for CTC-Attention-based Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2860, September 2019, pp. 3825-3829.
BibTeX TR2019-102 PDF
@inproceedings{Seki2019sep2,
  author    = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Moritz, Niko and Le Roux, Jonathan},
  title     = {Vectorized Beam Search for {CTC}-Attention-based Speech Recognition},
  booktitle = {Interspeech},
  year      = 2019,
  pages     = {3825--3829},
  month     = sep,
  doi       = {10.21437/Interspeech.2019-2860},
  url       = {https://www.merl.com/publications/TR2019-102}
}
Wichern, G., McQuinn, E., Antognini, J., Flynn, M., Zhu, R., Crow, D., Manilow, E., Le Roux, J., "WHAM!: Extending Speech Separation to Noisy Environments", Interspeech, DOI: 10.21437/Interspeech.2019-2821, September 2019, pp. 1368-1372.
BibTeX TR2019-099 PDF
@inproceedings{Wichern2019sep,
  author    = {Wichern, Gordon and McQuinn, Emmett and Antognini, Joe and Flynn, Michael and Zhu, Richard and Crow, Dwight and Manilow, Ethan and Le Roux, Jonathan},
  title     = {{WHAM!}: Extending Speech Separation to Noisy Environments},
  booktitle = {Interspeech},
  year      = 2019,
  pages     = {1368--1372},
  month     = sep,
  doi       = {10.21437/Interspeech.2019-2821},
  url       = {https://www.merl.com/publications/TR2019-099}
}
Aihara, R., Hanazawa, T., Okato, Y., Wichern, G., Le Roux, J., "Teacher-Student Deep Clustering For Low-Delay Single Channel Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682695, May 2019.
BibTeX TR2019-003 PDF
@inproceedings{Aihara2019may,
  author    = {Aihara, Ryo and Hanazawa, Toshiyuki and Okato, Yohei and Wichern, Gordon and Le Roux, Jonathan},
  title     = {Teacher-Student Deep Clustering For Low-Delay Single Channel Speech Separation},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8682695},
  url       = {https://www.merl.com/publications/TR2019-003}
}
Hori, T., Astudillo, R., Hayashi, T., Zhang, Y., Watanabe, S., Le Roux, J., "Cycle-Consistency Training for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683307, May 2019.
BibTeX TR2019-002 PDF
@inproceedings{Hori2019may,
  author    = {Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and Le Roux, Jonathan},
  title     = {Cycle-Consistency Training for End-to-End Speech Recognition},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8683307},
  url       = {https://www.merl.com/publications/TR2019-002}
}
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
BibTeX TR2019-008 PDF
@inproceedings{LeRoux2019may2,
  author    = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
  title     = {The Phasebook: Building Complex Masks via Discrete Representations for Source Separation},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8682587},
  url       = {https://www.merl.com/publications/TR2019-008}
}
Le Roux, J., Wisdom, S., Erdogan, H., Hershey, J., "SDR -- Half-Baked or Well Done?", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683855, May 2019.
BibTeX TR2019-013 PDF
@inproceedings{LeRoux2019may,
  author    = {Le Roux, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John},
  title     = {{SDR} -- Half-Baked or Well Done?},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8683855},
  url       = {https://www.merl.com/publications/TR2019-013}
}
Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
BibTeX TR2019-015 PDF
@inproceedings{Moritz2019may,
  author    = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title     = {Triggered Attention for End-to-End Speech Recognition},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8683510},
  url       = {https://www.merl.com/publications/TR2019-015}
}
Seetharaman, P., Wichern, G., Le Roux, J., Pardo, B., "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683198, May 2019.
BibTeX TR2019-014 PDF
@inproceedings{Seetharaman2019may2,
  author    = {Seetharaman, Prem and Wichern, Gordon and Le Roux, Jonathan and Pardo, Bryan},
  title     = {Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8683198},
  url       = {https://www.merl.com/publications/TR2019-014}
}
Seetharaman, P., Wichern, G., Venkataramani, S., Le Roux, J., "Class-Conditional Embeddings for Music Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683007, May 2019.
BibTeX TR2019-004 PDF
@inproceedings{Seetharaman2019may,
  author    = {Seetharaman, Prem and Wichern, Gordon and Venkataramani, Shrikant and Le Roux, Jonathan},
  title     = {Class-Conditional Embeddings for Music Source Separation},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2019,
  month     = may,
  doi       = {10.1109/ICASSP.2019.8683007},
  url       = {https://www.merl.com/publications/TR2019-004}
}
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "Phasebook and Friends: Leveraging discrete representations for source separation", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2019.2904183, Vol. 13, No. 2, pp. 370-382, March 2019.
BibTeX TR2018-199 PDF
@article{LeRoux2019mar,
  author  = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
  title   = {Phasebook and Friends: Leveraging discrete representations for source separation},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  year    = 2019,
  volume  = 13,
  number  = 2,
  pages   = {370--382},
  month   = mar,
  doi     = {10.1109/JSTSP.2019.2904183},
  url     = {https://www.merl.com/publications/TR2018-199}
}
Wichern, G., Le Roux, J., "Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC.2018.8521243, September 2018.
BibTeX TR2018-146 PDF
@inproceedings{Wichern2018sep,
  author    = {Wichern, Gordon and Le Roux, Jonathan},
  title     = {Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation},
  booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
  year      = 2018,
  month     = sep,
  doi       = {10.1109/IWAENC.2018.8521243},
  url       = {https://www.merl.com/publications/TR2018-146}
}
Wang, Z.-Q., Le Roux, J., Wang, D., Hershey, J., "End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction", Interspeech, September 2018.
BibTeX TR2018-135 PDF
@inproceedings{Wang2018sep,
  author    = {Wang, Zhong-Qiu and Le Roux, Jonathan and Wang, DeLiang and Hershey, John},
  title     = {End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction},
  booktitle = {Interspeech},
  year      = 2018,
  month     = sep,
  url       = {https://www.merl.com/publications/TR2018-135}
}
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "A Purely End-to-end System for Multi-speaker Speech Recognition", Annual Meeting of the Association for Computational Linguistics (ACL), July 2018, pp. 2620-2630.
BibTeX TR2018-104 PDF Video
@inproceedings{Seki2018jul,
  author    = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
  title     = {A Purely End-to-end System for Multi-speaker Speech Recognition},
  booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)},
  year      = 2018,
  pages     = {2620--2630},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  url       = {https://www.merl.com/publications/TR2018-104}
}
Erdogan, H., Hershey, J., Watanabe, S., Le Roux, J., "Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., and Hershey, J. R., Eds., chapter 7, Springer, July 2018.
BibTeX
@incollection{Erdogan2018jul,
  author    = {Erdogan, Hakan and Hershey, John and Watanabe, Shinji and Le Roux, Jonathan},
  title     = {Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  year      = 2018,
  editor    = {Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J. R.},
  chapter   = 7,
  month     = jul,
  publisher = {Springer},
  isbn      = {978-3-319-64680-0}
}
Hershey, J., Le Roux, J., Watanabe, S., Wisdom, S., Chen, Z., Isik, Y., "Novel deep architectures in speech processing" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S., Delcroix, M., Metze, F., and Hershey, J. R., Eds., chapter 6, Springer, July 9, 2018.
BibTeX
@incollection{Hershey2018jul,
  author    = {Hershey, John and Le Roux, Jonathan and Watanabe, Shinji and Wisdom, Scott and Chen, Zhuo and Isik, Yusuf},
  title     = {Novel deep architectures in speech processing},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  year      = 2018,
  editor    = {Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J. R.},
  chapter   = 6,
  month     = jul,
  publisher = {Springer},
  isbn      = {978-3-319-64680-0}
}
Seki, H., Watanabe, S., Hori, T., Le Roux, J., Hershey, J.R., "An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462180, April 2018, pp. 4919-4923.
BibTeX TR2018-002 PDF Video
@inproceedings{Seki2018apr,
  author    = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and Le Roux, Jonathan and Hershey, John R.},
  title     = {An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2018,
  pages     = {4919--4923},
  month     = apr,
  doi       = {10.1109/ICASSP.2018.8462180},
  url       = {https://www.merl.com/publications/TR2018-002}
}
Settle, S., Le Roux, J., Hori, T., Watanabe, S., Hershey, J.R., "End-to-End Multi-Speaker Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461893, April 2018, pp. 4819-4823.
BibTeX TR2018-001 PDF Video
@inproceedings{Settle2018apr,
  author    = {Settle, Shane and Le Roux, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
  title     = {End-to-End Multi-Speaker Speech Recognition},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2018,
  pages     = {4819--4823},
  month     = apr,
  doi       = {10.1109/ICASSP.2018.8461893},
  url       = {https://www.merl.com/publications/TR2018-001}
}
Wang, Z.-Q., Le Roux, J., Hershey, J.R., "Alternative Objective Functions for Deep Clustering", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462507, April 2018, pp. 686-690.
BibTeX TR2018-005 PDF
@inproceedings{Wang2018apr,
  author    = {Wang, Zhong-Qiu and Le Roux, Jonathan and Hershey, John R.},
  title     = {Alternative Objective Functions for Deep Clustering},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2018,
  pages     = {686--690},
  month     = apr,
  doi       = {10.1109/ICASSP.2018.8462507},
  url       = {https://www.merl.com/publications/TR2018-005}
}