Publications

Mumcu, F., Jones, M.J., Yilmaz, Y., Cherian, A., "Leveraging Multimodal LLM Descriptions of Activity for Explainable Semi-Supervised Video Anomaly Detection", Transactions on Machine Learning Research, February 2026.
BibTeX TR2026-027 PDF
- @article{Mumcu2026feb2,
- author = {Mumcu, Furkan and Jones, Michael J. and Yilmaz, Yasin and Cherian, Anoop},
- title = {{Leveraging Multimodal LLM Descriptions of Activity for Explainable Semi-Supervised Video Anomaly Detection}},
- journal = {Transactions on Machine Learning Research},
- year = 2026,
- month = feb,
- url = {https://www.merl.com/publications/TR2026-027}
- }
Hori, C., Kambara, M., Sugiura, K., Ota, K., Khurana, S., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Interactive Robot Action Replanning using Multimodal LLM Trained from Human Demonstration Videos", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49660.2025.10887717, April 2025.
BibTeX TR2025-034 PDF
- @inproceedings{Hori2025mar,
- author = {Hori, Chiori and Kambara, Motonari and Sugiura, Komei and Ota, Kei and Khurana, Sameer and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
- title = {{Interactive Robot Action Replanning using Multimodal {LLM} Trained from Human Demonstration Videos}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = apr,
- doi = {10.1109/ICASSP49660.2025.10887717},
- url = {https://www.merl.com/publications/TR2025-034}
- }
Ni, H., Egger, B., Lohit, S., Cherian, A., Wang, Y., Koike-Akino, T., Huang, S.X., Marks, T.K., "TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2024, pp. 9015-9025.
BibTeX TR2024-059 PDF Video Software Presentation
- @inproceedings{Ni2024jun,
- author = {Ni, Haomiao and Egger, Bernhard and Lohit, Suhas and Cherian, Anoop and Wang, Ye and Koike-Akino, Toshiaki and Huang, Sharon X. and Marks, Tim K.},
- title = {{TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2024,
- pages = {9015--9025},
- month = jun,
- url = {https://www.merl.com/publications/TR2024-059}
- }
Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
BibTeX TR2022-019 PDF
- @inproceedings{Shah2022apr,
- author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and {Le Roux}, Jonathan and Hori, Chiori},
- title = {{Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7732--7736},
- month = apr,
- publisher = {IEEE},
- issn = {1520-6149},
- isbn = {978-1-6654-0540-9},
- url = {https://www.merl.com/publications/TR2022-019}
- }
Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
BibTeX TR2022-016 PDF
- @inproceedings{Hori2022feb,
- author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and {Le Roux}, Jonathan and Marks, Tim K.},
- title = {{Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10}},
- booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
- year = 2022,
- month = feb,
- url = {https://www.merl.com/publications/TR2022-016}
- }
Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
BibTeX TR2022-025 PDF
- @inproceedings{Shah2022feb,
- author = {Shah, Ankit Parag and Hori, Takaaki and {Le Roux}, Jonathan and Hori, Chiori},
- title = {{DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning}},
- booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
- year = 2022,
- month = feb,
- url = {https://www.merl.com/publications/TR2022-025}
- }
Shi, L., Geng, S., Shuang, K., Hori, C., Liu, S., Gao, P., Su, S., "Multi-Layer Content Interaction Through Quaternion Product For Visual Question Answering", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP40776.2020.9053595, April 2020, pp. 4412-4416.
BibTeX TR2020-046 PDF
- @inproceedings{Shi2020apr,
- author = {Shi, Lei and Geng, Shijie and Shuang, Kai and Hori, Chiori and Liu, Songxiang and Gao, Peng and Su, Sen},
- title = {{Multi-Layer Content Interaction Through Quaternion Product For Visual Question Answering}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2020,
- pages = {4412--4416},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP40776.2020.9053595},
- issn = {2379-190X},
- isbn = {978-1-5090-6631-5},
- url = {https://www.merl.com/publications/TR2020-046}
- }
Cherian, A., Wang, J., Hori, C., Marks, T.K., "Spatio-Temporal Ranked-Attention Networks for Video Captioning", IEEE Winter Conference on Applications of Computer Vision (WACV), DOI: 10.1109/WACV45572.2020.9093291, February 2020, pp. 1606-1615.
BibTeX TR2020-016 PDF
- @inproceedings{Cherian2020feb,
- author = {Cherian, Anoop and Wang, Jue and Hori, Chiori and Marks, Tim K.},
- title = {{Spatio-Temporal Ranked-Attention Networks for Video Captioning}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2020,
- pages = {1606--1615},
- month = feb,
- publisher = {IEEE},
- doi = {10.1109/WACV45572.2020.9093291},
- url = {https://www.merl.com/publications/TR2020-016}
- }
Hori, C., Alamri, H., Wang, J., Wichern, G., Hori, T., Cherian, A., Marks, T.K., Cartillier, V., Lopes, R., Das, A., Essa, I., Batra, D., Parikh, D., "End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682583, May 2019.
BibTeX TR2019-016 PDF
- @inproceedings{Hori2019may2,
- author = {Hori, Chiori and Alamri, Huda and Wang, Jue and Wichern, Gordon and Hori, Takaaki and Cherian, Anoop and Marks, Tim K. and Cartillier, Vincent and Lopes, Raphael and Das, Abhishek and Essa, Irfan and Batra, Dhruv and Parikh, Devi},
- title = {{End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8682583},
- url = {https://www.merl.com/publications/TR2019-016}
- }
Alamri, H., Cartillier, V., Lopes, R., Das, A., Wang, J., Essa, I., Batra, D., Parikh, D., Cherian, A., Marks, T.K., Hori, C., "Audio Visual Scene-Aware Dialog (AVSD) Challenge at DSTC7", arXiv, July 12, 2018.
BibTeX arXiv
- @article{Alamri2018jul,
- author = {Alamri, Huda and Cartillier, Vincent and Lopes, Raphael and Das, Abhishek and Wang, Jue and Essa, Irfan and Batra, Dhruv and Parikh, Devi and Cherian, Anoop and Marks, Tim K. and Hori, Chiori},
- title = {{Audio Visual Scene-Aware Dialog (AVSD) Challenge at DSTC7}},
- journal = {arXiv},
- year = 2018,
- month = jul,
- url = {https://arxiv.org/abs/1806.00525}
- }
Hori, C., Hori, T., Marks, T.K., Hershey, J.R., "Early and Late Integration of Audio Features for Automatic Video Description", IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), DOI: 10.1109/ASRU.2017.8268968, December 2017.
BibTeX TR2017-183 PDF
- @inproceedings{Hori2017dec2,
- author = {Hori, Chiori and Hori, Takaaki and Marks, Tim K. and Hershey, John R.},
- title = {{Early and Late Integration of Audio Features for Automatic Video Description}},
- booktitle = {IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
- year = 2017,
- month = dec,
- doi = {10.1109/ASRU.2017.8268968},
- url = {https://www.merl.com/publications/TR2017-183}
- }
Hori, C., Hori, T., Lee, T.-Y., Zhang, Z., Harsham, B.A., Sumi, K., Marks, T.K., Hershey, J.R., "Attention-Based Multimodal Fusion for Video Description", IEEE International Conference on Computer Vision (ICCV), DOI: 10.1109/ICCV.2017.450, October 2017.
BibTeX TR2017-156 PDF
- @inproceedings{Hori2017oct,
- author = {Hori, Chiori and Hori, Takaaki and Lee, Teng-Yok and Zhang, Ziming and Harsham, Bret A. and Sumi, Kazuhiko and Marks, Tim K. and Hershey, John R.},
- title = {{Attention-Based Multimodal Fusion for Video Description}},
- booktitle = {IEEE International Conference on Computer Vision (ICCV)},
- year = 2017,
- month = oct,
- doi = {10.1109/ICCV.2017.450},
- url = {https://www.merl.com/publications/TR2017-156}
- }
Porikli, F., Tuzel, O., "Covariance Tracker", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2006.
BibTeX TR2006-042 PDF
- @inproceedings{Porikli2006jun2,
- author = {Porikli, F. and Tuzel, O.},
- title = {{Covariance Tracker}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2006,
- month = jun,
- url = {https://www.merl.com/publications/TR2006-042}
- }
Ma, Z., Shao, H.-R., Shen, C., "A New Multi-path Selection Scheme for Video Streaming on Overlay Networks", IEEE International Conference on Communications (ICC), June 2004, vol. 3, pp. 1330-1334.
BibTeX TR2004-064 PDF
- @inproceedings{Ma2004jun,
- author = {Ma, Z. and Shao, H.-R. and Shen, C.},
- title = {{A New Multi-path Selection Scheme for Video Streaming on Overlay Networks}},
- booktitle = {IEEE International Conference on Communications (ICC)},
- year = 2004,
- volume = 3,
- pages = {1330--1334},
- month = jun,
- url = {https://www.merl.com/publications/TR2004-064}
- }
Xie, L., Chang, S.-F., Divakaran, A., Sun, H., "Feature Selection for Unsupervised Discovery of Statistical Temporal Structures in Video", IEEE International Conference on Image Processing (ICIP), September 2003, vol. 1, pp. 29-32.
BibTeX TR2003-116 PDF
- @inproceedings{Xie2003sep,
- author = {Xie, L. and Chang, S.-F. and Divakaran, A. and Sun, H.},
- title = {{Feature Selection for Unsupervised Discovery of Statistical Temporal Structures in Video}},
- booktitle = {IEEE International Conference on Image Processing (ICIP)},
- year = 2003,
- volume = 1,
- pages = {29--32},
- month = sep,
- url = {https://www.merl.com/publications/TR2003-116}
- }
Xie, L., Chang, S.-F., Divakaran, A., Sun, H., "Unsupervised Discovery of Multilevel Statistical Video Structures Using Hierarchical Hidden Markov Models", IEEE International Conference on Multimedia and Expo (ICME), July 2003, vol. 3, pp. 29-32.
BibTeX TR2003-101 PDF
- @inproceedings{Xie2003jul,
- author = {Xie, L. and Chang, S.-F. and Divakaran, A. and Sun, H.},
- title = {{Unsupervised Discovery of Multilevel Statistical Video Structures Using Hierarchical Hidden Markov Models}},
- booktitle = {IEEE International Conference on Multimedia and Expo (ICME)},
- year = 2003,
- volume = 3,
- pages = {29--32},
- month = jul,
- url = {https://www.merl.com/publications/TR2003-101}
- }
Zhou, J., Shao, H.-R., Shen, C., Sun, M.-T., "Multi-Path Transport of FGS Video", Packet Video (PV), April 2003.
BibTeX TR2003-10 PDF
- @inproceedings{Zhou2003apr,
- author = {Zhou, J. and Shao, H.-R. and Shen, C. and Sun, M.-T.},
- title = {{Multi-Path Transport of FGS Video}},
- booktitle = {Packet Video (PV)},
- year = 2003,
- month = apr,
- url = {https://www.merl.com/publications/TR2003-10}
- }
Lin, S., Vetro, A., Wang, Y., "Rate-Distortion Analysis of the Multiple Description Motion Compensation Video Coding Scheme", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2003, vol. 3, pp. 401-404.
BibTeX TR2003-27 PDF
- @inproceedings{Lin2003apr,
- author = {Lin, S. and Vetro, A. and Wang, Y.},
- title = {{Rate-Distortion Analysis of the Multiple Description Motion Compensation Video Coding Scheme}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2003,
- volume = 3,
- pages = {401--404},
- month = apr,
- issn = {1520-6149},
- url = {https://www.merl.com/publications/TR2003-27}
- }
Divakaran, A., Sekiguchi, S., Asai, K., Sun, H., "A Description Scheme for Video Based on Feature Extraction in the Compressed Domain", IEEE International Conference on Consumer Electronics (ICCE), June 2000, pp. 278-279.
BibTeX IEEE Xplore
- @inproceedings{Divakaran2000jun,
- author = {Divakaran, A. and Sekiguchi, S. and Asai, K. and Sun, H.},
- title = {{A Description Scheme for Video Based on Feature Extraction in the Compressed Domain}},
- booktitle = {IEEE International Conference on Consumer Electronics (ICCE)},
- year = 2000,
- pages = {278--279},
- month = jun,
- url = {https://ieeexplore.ieee.org/document/854628}
- }
Matthew Brand, "An entropic estimator for structure discovery", Tech. Rep. TR98-19, Mitsubishi Electric Research Laboratories, Cambridge, MA, September 1998.
BibTeX TR98-19 PDF
- @techreport{MERL_TR98-19,
- author = {Matthew Brand},
- title = {An entropic estimator for structure discovery},
- institution = {MERL - Mitsubishi Electric Research Laboratories},
- address = {Cambridge, MA 02139},
- number = {TR98-19},
- month = sep,
- year = 1998,
- url = {https://www.merl.com/publications/TR98-19/}
- }
Matthew Brand, "Learning concise models of human activity from ambient video via a structure-inducing M-step estimator", Tech. Rep. TR97-25, Mitsubishi Electric Research Laboratories, Cambridge, MA, November 1997.
BibTeX TR97-25 PDF
- @techreport{MERL_TR97-25,
- author = {Matthew Brand},
- title = {Learning concise models of human activity from ambient video via a structure-inducing {M-step} estimator},
- institution = {MERL - Mitsubishi Electric Research Laboratories},
- address = {Cambridge, MA 02139},
- number = {TR97-25},
- month = nov,
- year = 1997,
- url = {https://www.merl.com/publications/TR97-25/}
- }