Publications

19 / 3,619 publications found.


  •  Bralios, D., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "Generation or Replication: Auscultating Audio Latent Diffusion Models", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-027 PDF
    • @inproceedings{Bralios2024mar,
    • author = {Bralios, Dimitrios and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-027}
    • }
  •  Germain, F., Wichern, G., Le Roux, J., "Hyperbolic Unsupervised Anomalous Sound Detection", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248092, September 2023.
    BibTeX TR2023-108 PDF Video Presentation
    • @inproceedings{Germain2023aug,
    • author = {Germain, Francois and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Hyperbolic Unsupervised Anomalous Sound Detection},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2023,
    • month = sep,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA58266.2023.10248092},
    • issn = {1947-1629},
    • isbn = {979-8-3503-2372-6},
    • url = {https://www.merl.com/publications/TR2023-108}
    • }
  •  Petermann, D., Wichern, G., Subramanian, A.S., Wang, Z.-Q., Le Roux, J., "Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2023.3290428, Vol. 31, pp. 2592-2605, September 2023.
    BibTeX TR2023-113 PDF
    • @article{Petermann2023sep,
    • author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2023,
    • volume = 31,
    • pages = {2592--2605},
    • month = sep,
    • doi = {10.1109/TASLP.2023.3290428},
    • issn = {2329-9304},
    • url = {https://www.merl.com/publications/TR2023-113}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F., Le Roux, J., Watanabe, S., "BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up", Tech. Rep. TR2023-068, DCASE2023 Challenge, May 2023.
    BibTeX TR2023-068 PDF
    • @techreport{Wu2023may,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, Francois and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up},
    • institution = {DCASE2023 Challenge},
    • year = 2023,
    • month = may,
    • url = {https://www.merl.com/publications/TR2023-068}
    • }
  •  Chang, X., Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747375, April 2022, pp. 7322-7326.
    BibTeX TR2022-021 PDF
    • @inproceedings{Chang2022apr,
    • author = {Chang, Xuankai and Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan},
    • title = {Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7322--7326},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747375},
    • url = {https://www.merl.com/publications/TR2022-021}
    • }
  •  Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Sequence Transduction with Graph-based Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747788, April 2022, pp. 7212-7216.
    BibTeX TR2022-024 PDF
    • @inproceedings{Moritz2022apr,
    • author = {Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan},
    • title = {Sequence Transduction with Graph-based Supervision},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7212--7216},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747788},
    • url = {https://www.merl.com/publications/TR2022-024}
    • }
  •  Petermann, D., Wichern, G., Wang, Z.-Q., Le Roux, J., "The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746005, April 2022, pp. 526-530.
    BibTeX TR2022-022 PDF Software
    • @inproceedings{Petermann2022apr,
    • author = {Petermann, Darius and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {526--530},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9746005},
    • url = {https://www.merl.com/publications/TR2022-022}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Capturing Multi-Resolution Context by Dilated Self-Attention", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9415001, June 2021, pp. 5869-5873.
    BibTeX TR2021-036 PDF
    • @inproceedings{Moritz2021jun,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Capturing Multi-Resolution Context by Dilated Self-Attention},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {5869--5873},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9415001},
    • url = {https://www.merl.com/publications/TR2021-036}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Semi-Supervised Speech Recognition via Graph-Based Temporal Classification", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414058, June 2021, pp. 6548-6552.
    BibTeX TR2021-037 PDF
    • @inproceedings{Moritz2021jun2,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Semi-Supervised Speech Recognition via Graph-Based Temporal Classification},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2021,
    • pages = {6548--6552},
    • month = jun,
    • doi = {10.1109/ICASSP39728.2021.9414058},
    • url = {https://www.merl.com/publications/TR2021-037}
    • }
  •  Sari, L., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9054249, April 2020, pp. 7384-7388.
    BibTeX TR2020-037 PDF Video Presentation
    • @inproceedings{Sari2020apr,
    • author = {Sari, Leda and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unsupervised Speaker Adaptation Using Attention-Based Speaker Memory For End-To-End ASR},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2020,
    • pages = {7384--7388},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP40776.2020.9054249},
    • issn = {2379-190X},
    • isbn = {978-1-5090-6631-5},
    • url = {https://www.merl.com/publications/TR2020-037}
    • }
  •  Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
    BibTeX TR2019-008 PDF
    • @inproceedings{LeRoux2019may2,
    • author = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
    • title = {The Phasebook: Building Complex Masks via Discrete Representations for Source Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682587},
    • url = {https://www.merl.com/publications/TR2019-008}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
    BibTeX TR2019-015 PDF
    • @inproceedings{Moritz2019may,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Triggered Attention for End-to-End Speech Recognition},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683510},
    • url = {https://www.merl.com/publications/TR2019-015}
    • }
  •  Zmolikova, K., Karafiat, M., Vesely, K., Delcroix, M., Watanabe, S., Burget, L., Cernocky, J.H., "Data selection by sequence summarizing neural network in mismatch condition training", Interspeech, DOI: 10.21437/Interspeech.2016-741, September 2016, pp. 2354-2358.
    BibTeX TR2016-075 PDF
    • @inproceedings{Zmolikova2016sep,
    • author = {Zmolikova, Katerina and Karafiat, Martin and Vesely, Karel and Delcroix, Marc and Watanabe, Shinji and Burget, Lukas and Cernocky, Jan Honza},
    • title = {Data selection by sequence summarizing neural network in mismatch condition training},
    • booktitle = {Interspeech},
    • year = 2016,
    • pages = {2354--2358},
    • month = sep,
    • doi = {10.21437/Interspeech.2016-741},
    • url = {https://www.merl.com/publications/TR2016-075}
    • }
  •  Vesely, K., Watanabe, S., Zmolikova, K., Karafiat, M., Burget, L., Cernocky, J.H., "Sequence Summarizing Neural Network for Speaker Adaptation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2016.7472692, March 2016, pp. 5315-5319.
    BibTeX TR2016-001 PDF
    • @inproceedings{Vesely2016mar,
    • author = {Vesely, Karel and Watanabe, Shinji and Zmolikova, Katerina and Karafiat, Martin and Burget, Lukas and Cernocky, Jan Honza},
    • title = {Sequence Summarizing Neural Network for Speaker Adaptation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2016,
    • pages = {5315--5319},
    • month = mar,
    • doi = {10.1109/ICASSP.2016.7472692},
    • url = {https://www.merl.com/publications/TR2016-001}
    • }
  •  Sundaresan, R., Porikli, F., "Additive Noise Removal by Sparse Reconstruction on Image Affinity Nets", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2012.
    BibTeX TR2012-019 PDF
    • @inproceedings{Sundaresan2012mar,
    • author = {Sundaresan, R. and Porikli, F.},
    • title = {Additive Noise Removal by Sparse Reconstruction on Image Affinity Nets},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2012,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2012-019}
    • }
  •  Min, D., Yea, S., Arican, Z., Vetro, A., "Disparity Search Range Estimation: Enforcing Temporal Consistency", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2010.5496135, March 2010, pp. 2366-2369.
    BibTeX TR2010-013 PDF
    • @inproceedings{Min2010mar,
    • author = {Min, D. and Yea, S. and Arican, Z. and Vetro, A.},
    • title = {Disparity Search Range Estimation: Enforcing Temporal Consistency},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2010,
    • pages = {2366--2369},
    • month = mar,
    • doi = {10.1109/ICASSP.2010.5496135},
    • url = {https://www.merl.com/publications/TR2010-013}
    • }
  •  Das, S., Rane, S.D., Vetro, A., "Hiding Information Inside Structured Shapes", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2010.
    BibTeX TR2010-005 PDF
    • @inproceedings{Das2010mar,
    • author = {Das, S. and Rane, S. D. and Vetro, A.},
    • title = {Hiding Information Inside Structured Shapes},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2010,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2010-005}
    • }
  •  Vlasic, D., Adelsberger, R., Vannucci, G., Barnwell, J., Gross, M., Matusik, W., Popovic, J., "Practical Motion Capture in Everyday Surroundings", ACM Transactions on Graphics (TOG), Vol. 26, No. 3, August 2007.
    BibTeX TR2007-111 PDF
    • @article{Vlasic2007aug,
    • author = {Vlasic, D. and Adelsberger, R. and Vannucci, G. and Barnwell, J. and Gross, M. and Matusik, W. and Popovic, J.},
    • title = {Practical Motion Capture in Everyday Surroundings},
    • journal = {ACM Transactions on Graphics (TOG)},
    • year = 2007,
    • volume = 26,
    • number = 3,
    • month = aug,
    • issn = {0730-0301},
    • url = {https://www.merl.com/publications/TR2007-111}
    • }
  •  Nie, Y., Kong, H.-S., Vetro, A., Barner, K., "Fast Adaptive Fuzzy Post-Filtering for Coding Artifacts Removal in Interlaced Video", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2005, Vol. 2, pp. 993-996.
    BibTeX TR2005-018 PDF
    • @inproceedings{Nie2005mar,
    • author = {Nie, Y. and Kong, H.-S. and Vetro, A. and Barner, K.},
    • title = {Fast Adaptive Fuzzy Post-Filtering for Coding Artifacts Removal in Interlaced Video},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2005,
    • volume = 2,
    • pages = {993--996},
    • month = mar,
    • issn = {1520-6149},
    • url = {https://www.merl.com/publications/TR2005-018}
    • }