Humans excel at forming mental maps of their surroundings, equipping them to understand object relationships and navigate based on language queries. Our previous work, SI Maps, showed that instance-level information and a semantic understanding of the environment significantly improve performance on language-guided tasks. We extend this instance-level approach to 3D while increasing the pipeline's robustness and improving quantitative and qualitative results. Our method leverages foundation models for object recognition, image segmentation, and feature extraction. We propose a representation that results in a 3D point cloud map with instance-level embeddings, which bring in the semantic understanding that natural language commands can query. Quantitatively, our work improves the success rate of language-guided tasks. Qualitatively, we observe the ability to identify instances more clearly and to leverage foundation models and language- and image-aligned embeddings to identify objects that a closed-set approach would otherwise fail to recognize.
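As a rough illustration of how a natural-language command can be resolved against such an instance-level map, here is a minimal sketch (not the authors' code): each map instance is assumed to carry a language/image-aligned embedding (e.g., from a CLIP-style encoder), the query is assumed to be embedded with the matching text encoder, and the best instance is selected by cosine similarity. The names `instance_embeddings`, `query_embedding`, and `query_instances` are hypothetical placeholders.

```python
# Minimal sketch of language-based instance retrieval over a 3D instance map.
# Assumes instance embeddings and the query embedding are already computed
# with a shared language/image-aligned encoder (e.g., CLIP-style); these
# arrays are stand-ins, not the paper's actual pipeline outputs.
import numpy as np

def query_instances(instance_embeddings: np.ndarray,
                    query_embedding: np.ndarray) -> int:
    """Return the index of the map instance whose embedding best
    matches the query embedding, by cosine similarity."""
    inst = instance_embeddings / np.linalg.norm(
        instance_embeddings, axis=1, keepdims=True)
    q = query_embedding / np.linalg.norm(query_embedding)
    similarities = inst @ q  # one cosine-similarity score per instance
    return int(np.argmax(similarities))

# Example: 5 map instances with 512-dim embeddings, one text query.
rng = np.random.default_rng(0)
instances = rng.normal(size=(5, 512))
query = rng.normal(size=512)
print(f"Best-matching instance index: {query_instances(instances, query)}")
```

In a real open-set pipeline, the argmax would typically be replaced by a similarity threshold so that queries with no good match in the map can be rejected rather than forced onto the nearest instance.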
@article{doi:10.1080/01691864.2024.2395926,
  author  = {Nanwani, Laksh and Gupta, Kumaraditya and Mathur, Aditya and Agrawal, Swayam and Abdul Hafez, A. H. and Krishna, K. Madhava},
  title   = {Open-Set 3D Semantic Instance Maps for Vision Language Navigation - O3D-SIM},
  journal = {Advanced Robotics},
  year    = {2024},
  doi     = {10.1080/01691864.2024.2395926},
}