
Robotic Grasping Papers and Codes

This repo is a paper list of robotic grasping and some related tasks (6D pose estimation, visual grounding, robotic manipulation, etc.).

Abbreviation:

  • ICRA is IEEE International Conference on Robotics and Automation;
  • CVPR is IEEE Conference on Computer Vision and Pattern Recognition;
  • ICCV is IEEE International Conference on Computer Vision;
  • ECCV is European Conference on Computer Vision;
  • CoRL is Conference on Robot Learning;
  • NIPS is Conference on Neural Information Processing Systems;
  • RA-L is IEEE Robotics and Automation Letters;
  • Humanoids is IEEE-RAS International Conference on Humanoid Robots;
  • IJRR is The International Journal of Robotics Research;
  • IROS is IEEE/RSJ International Conference on Intelligent Robots and Systems;
  • ACM MM is ACM International Conference on Multimedia;
  • RSS is Robotics: Science and Systems;
  • T-RO is IEEE Transactions on Robotics.

1. Survey Papers

[T-RO2023] Deep Learning Approaches to Grasp Synthesis: A Review, [Project], [Paper].

Keywords: focuses on 6D grasping; covers sampling-based approaches, direct regression, shape completion, reinforcement learning, and semantics-aware methods.

@ARTICLE{10149823,
  author={Newbury, Rhys and Gu, Morris and Chumbley, Lachlan and Mousavian, Arsalan and Eppner, Clemens and Leitner, Jürgen and Bohg, Jeannette and Morales, Antonio and Asfour, Tamim and Kragic, Danica and Fox, Dieter and Cosgun, Akansel},
  journal={IEEE Transactions on Robotics}, 
  title={Deep Learning Approaches to Grasp Synthesis: A Review}, 
  year={2023},
  volume={},
  number={},
  pages={1-22},
  doi={10.1109/TRO.2023.3280597}
}

[arXiv2022] Robotic Grasping from Classical to Modern: A Survey, [Project], [Paper].

Keywords: overview of analytic and data-driven methods for robotic grasping.

@article{zhang2022robotic,
  title={Robotic Grasping from Classical to Modern: A Survey},
  author={Zhang, Hanbo and Tang, Jian and Sun, Shiguang and Lan, Xuguang},
  journal={arXiv preprint arXiv:2202.03631},
  year={2022}
}

[Artificial Intelligence Review (2021)] Vision-based robotic grasping from object localization, object pose estimation to grasp estimation for parallel grippers: a review, [Paper].

Keywords: object localization, object pose estimation and grasp estimation.

@article{du2021vision,
  title={Vision-based robotic grasping from object localization, object pose estimation to grasp estimation for parallel grippers: a review},
  author={Du, Guoguang and Wang, Kai and Lian, Shiguo and Zhao, Kaiyong},
  journal={Artificial Intelligence Review},
  volume={54},
  number={3},
  pages={1677--1734},
  year={2021},
  publisher={Springer}
}

2. Related Tasks

2.1 Visual grounding

[CVPR2022] Multi-View Transformer for 3D Visual Grounding, [Paper], [Code].

Keywords: Transformer based; learn view-robust representation, eliminate the dependence on specific views.

@InProceedings{Huang_2022_CVPR,
    author    = {Huang, Shijia and Chen, Yilun and Jia, Jiaya and Wang, Liwei},
    title     = {Multi-View Transformer for 3D Visual Grounding},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {15524-15533}
}

[CVPR2022] Text2Pos: Text-to-Point-Cloud Cross-Modal Localization, [Project], [Paper], [Code].

Keywords: city-scale outdoor point cloud localization; provides the KITTI360Pose dataset based on KITTI360; coarse-to-fine method, first retrieves sub-regions, then refines the position with a matching-based fine localization module.

@inproceedings{kolmet2022text2pos, 
  title = {Text2Pos: Text-to-Point-Cloud Cross-Modal Localization}, 
  author={Manuel Kolmet and Qunjie Zhou and Aljosa Osep and Laura Leal-Taix{\'e}}, 
  booktitle = { IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 
  year = {2022}, 
}

[CVPR2022] 3D-SPS: Single-Stage 3D Visual Grounding via Referred Point Progressive Selection, [Paper], [Code].

Keywords: input point cloud, RGB, normal vectors and language text; PointNet++ backbone for point cloud; output target object bounding box; single-stage method; cross-modal transformer model is used.

@InProceedings{Luo_2022_CVPR,
    author    = {Luo, Junyu and Fu, Jiahui and Kong, Xianghao and Gao, Chen and Ren, Haibing and Shen, Hao and Xia, Huaxia and Liu, Si},
    title     = {3D-SPS: Single-Stage 3D Visual Grounding via Referred Point Progressive Selection},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {16454-16463}
}
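
Several of the transformer-based grounding methods in this subsection (e.g. 3D-SPS above and TransRefer3D below) rely on cross-modal attention, where 3D point or proposal features attend to word features before the target object is selected. A minimal PyTorch sketch of one such cross-attention step; the layer sizes and residual/norm layout are illustrative, not any paper's exact architecture.

import torch
import torch.nn as nn

class CrossModalAttention(nn.Module):
    """Point/proposal features attend to word features (one direction shown)."""
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, point_feats, word_feats):
        # queries: 3D point/proposal features; keys & values: language token features
        attended, _ = self.attn(point_feats, word_feats, word_feats)
        return self.norm(point_feats + attended)

# toy usage: 1024 point features attending to a 20-token description
layer = CrossModalAttention()
pts = torch.randn(2, 1024, 256)
words = torch.randn(2, 20, 256)
print(layer(pts, words).shape)   # torch.Size([2, 1024, 256])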

[CVPR2022] Improving Visual Grounding with Visual-Linguistic Verification and Iterative Reasoning, [Paper], [Code].

Keywords: 2D visual grounding; one-stage method; transformer-based architecture.

@inproceedings{yang2022vgvl,
  title={Improving Visual Grounding with Visual-Linguistic Verification and Iterative Reasoning},
  author={Yang, Li and Xu, Yan and Yuan, Chunfeng and Liu, Wei and Li, Bing and Hu, Weiming},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2022}
}

[arXiv2021] Looking Outside the Box to Ground Language in 3D Scenes, [Paper], [Code].

Keywords: propose BEAUTY-DETR, a transformer-like architecture for 3D visual grounding; input scene point cloud, query text and object proposals generated by a pretrained detector.

@article{jain2021looking,
  title={Looking Outside the Box to Ground Language in 3D Scenes},
  author={Jain, Ayush and Gkanatsios, Nikolaos and Mediratta, Ishita and Fragkiadaki, Katerina},
  journal={arXiv preprint arXiv:2112.08879},
  year={2021}
}

[CoRL2021] LanguageRefer: Spatial-Language Model for 3D Visual Grounding, [Project], [Paper], [Code].

Keywords: 3D scene point clouds; ReferIt3D dataset; transformer-based network; add viewpoint annotation, has explicit viewpoint information in the utterance.

@InProceedings{pmlr-v164-roh22a,
  title =      {LanguageRefer: Spatial-Language Model for 3D Visual Grounding},
  author =       {Roh, Junha and Desingh, Karthik and Farhadi, Ali and Fox, Dieter},
  booktitle =      {Proceedings of the 5th Conference on Robot Learning},
  pages =      {1046--1056},
  year =      {2022},
  volume =      {164},
  series =      {Proceedings of Machine Learning Research},
  publisher = {PMLR},
}

[CoRL2021] Language Grounding with 3D Objects, [Paper], [Code], [Supp].

Keywords: distinguish between objects in a pair based on referring expressions; annotate the SNARE dataset, built on ShapeNet, with 7897 objects and 50000 natural language referring expressions.

@inproceedings{thomason2022language,
  title={Language grounding with 3D objects},
  author={Thomason, Jesse and Shridhar, Mohit and Bisk, Yonatan and Paxton, Chris and Zettlemoyer, Luke},
  booktitle={Conference on Robot Learning},
  pages={1691--1701},
  year={2022},
  organization={PMLR}
}

[CVPR2021] Refer-it-in-RGBD: A Bottom-up Approach for 3D Visual Grounding in RGBD Images, [Paper], [Code], [Supp].

Keywords: new task, single-view 3D visual grounding; input single-view RGBD and language text; fuse language and visual features gradually; contribute a large-scale dataset by annotating SUNRGBD with referring expressions; GloVe for word embedding; GRU to encode the description.

@InProceedings{Liu_2021_CVPR,
    author    = {Liu, Haolin and Lin, Anran and Han, Xiaoguang and Yang, Lei and Yu, Yizhou and Cui, Shuguang},
    title     = {Refer-It-in-RGBD: A Bottom-Up Approach for 3D Visual Grounding in RGBD Images},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2021},
    pages     = {6032-6041}
}
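
Refer-it-in-RGBD and the free-form description guided network below both embed words with GloVe and encode the sentence with a GRU. A minimal PyTorch sketch of such a description encoder, assuming pre-looked-up GloVe vectors as input; the hidden size and mean pooling are illustrative choices.

import torch
import torch.nn as nn

class DescriptionEncoder(nn.Module):
    """Encode a referring expression: GloVe word vectors -> GRU -> sentence feature."""
    def __init__(self, glove_dim=300, hidden_dim=256):
        super().__init__()
        self.gru = nn.GRU(glove_dim, hidden_dim, batch_first=True, bidirectional=True)

    def forward(self, word_vecs):            # word_vecs: (B, num_words, glove_dim)
        word_feats, _ = self.gru(word_vecs)  # per-word features, (B, num_words, 2*hidden_dim)
        sent_feat = word_feats.mean(dim=1)   # simple pooling to a sentence-level feature
        return word_feats, sent_feat

# toy usage: a batch of 2 descriptions, 12 words each, 300-d GloVe vectors
enc = DescriptionEncoder()
words = torch.randn(2, 12, 300)
word_feats, sent_feat = enc(words)
print(word_feats.shape, sent_feat.shape)     # torch.Size([2, 12, 512]) torch.Size([2, 512])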

[ICCV2021] SAT: 2D Semantics Assisted Training for 3D Visual Grounding, [Paper], [Code], [Supp].

Keywords: 2D image assisted training, no 2D images needed at inference; auxiliary loss functions align objects in 2D images with the corresponding ones in 3D point clouds or language queries; transformer-based method.

@InProceedings{Yang_2021_ICCV,
    author    = {Yang, Zhengyuan and Zhang, Songyang and Wang, Liwei and Luo, Jiebo},
    title     = {SAT: 2D Semantics Assisted Training for 3D Visual Grounding},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2021},
    pages     = {1856-1866}
}

[ICCV2021] Free-form Description Guided 3D Visual Graph Network for Object Grounding in Point Cloud, [Paper], [Code], [Supp].

Keywords: free-form description and scene point cloud input; ScanRefer and Nr3D dataset; construct language scene graph and multi-level proposal relation graph; VoteNet for 3D object proposals; GloVe for word embedding; GRU to encode the description.

@InProceedings{Feng_2021_ICCV,
    author    = {Feng, Mingtao and Li, Zhen and Li, Qi and Zhang, Liang and Zhang, XiangDong and Zhu, Guangming and Zhang, Hui and Wang, Yaonan and Mian, Ajmal},
    title     = {Free-Form Description Guided 3D Visual Graph Network for Object Grounding in Point Cloud},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2021},
    pages     = {3722-3731}
}

[ICCV2021] 3DVG-Transformer: Relation Modeling for Visual Grounding on Point Clouds, [Paper].

Keywords: transformer based model; grounding by detection; model proposal relation to generate context-aware object proposals; leverage proposal relations to distinguish the true target object from similar proposals.

@InProceedings{Zhao_2021_ICCV,
    author    = {Zhao, Lichen and Cai, Daigang and Sheng, Lu and Xu, Dong},
    title     = {3DVG-Transformer: Relation Modeling for Visual Grounding on Point Clouds},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2021},
    pages     = {2928-2937}
}

[ICCV2021] InstanceRefer: Cooperative Holistic Understanding for Visual Grounding on Point Clouds through Instance Multi-level Contextual Referring, [Paper], [Code].

Keywords: ScanRefer and ReferIt3D dataset; two-stage method, grounding-by-matching.

@InProceedings{yuan2021instancerefer,
  title={Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring},
  author={Yuan, Zhihao and Yan, Xu and Liao, Yinghong and Zhang, Ruimao and Li, Zhen and Cui, Shuguang},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={1791-1800},
  year={2021}
}

[ACM MM2021] TransRefer3D: Entity-and-Relation Aware Transformer for Fine-Grained 3D Visual Grounding, [Paper], [Code].

Keywords: transformer-based model; PointNet++ for point cloud; ReferIt3D dataset; entity-aware attention and relation-aware attention for cross-modal feature matching; two auxiliary tasks, utterance classification of the referent and object classification, for better feature extraction.

@inproceedings{transrefer3d,
    title={TransRefer3D: Entity-and-Relation Aware Transformer for Fine-Grained 3D Visual Grounding},
    author={He, Dailan and Zhao, Yusheng and Luo, Junyu and Hui, Tianrui and Huang, Shaofei and Zhang, Aixi and Liu, Si},
    booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
    year={2021}
}

2.2 Robotic manipulation

[CVPR2023] Learning Human-to-Robot Handovers from Point Clouds, [Project], [Paper], [Code].

Keywords: point cloud input; 6D grasp; trained by interacting with the humans in simulation environment; reinforcement learning, two-stage training scheme.

Motivation: To close the gap in human-in-the-loop policy training for human-to-robot (H2R) handovers, this paper introduces a vision-based learning framework that is trained with a human-in-the-loop in simulation.

@inproceedings{christen2023handoversim2real,
      title = {Learning Human-to-Robot Handovers from Point Clouds},
      author = {Christen, Sammy and Yang, Wei and Pérez-D'Arpino, Claudia and Hilliges, Otmar and Fox, Dieter and Chao, Yu-Wei},
      booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
      year = {2023}
}

[CoRL2022] BEHAVIOR-1K: A Benchmark for Embodied AI with 1,000 Everyday Activities and Realistic Simulation, [Project], [Paper].

Keywords: a comprehensive simulation benchmark for human-centered robotics; 1000 everyday activities, 50 scenes, 5000+ objects.

@InProceedings{pmlr-v205-li23a,
  title = 	 {BEHAVIOR-1K: A Benchmark for Embodied AI with 1,000 Everyday Activities and Realistic Simulation},
  author =       {Li, Chengshu and Zhang, Ruohan and Wong, Josiah and Gokmen, Cem and Srivastava, Sanjana and Mart\'in-Mart\'in, Roberto and Wang, Chen and Levine, Gabrael and Lingelbach, Michael and Sun, Jiankai and Anvari, Mona and Hwang, Minjune and Sharma, Manasi and Aydin, Arman and Bansal, Dhruva and Hunter, Samuel and Kim, Kyu-Young and Lou, Alan and Matthews, Caleb R and Villa-Renteria, Ivan and Tang, Jerry Huayang and Tang, Claire and Xia, Fei and Savarese, Silvio and Gweon, Hyowon and Liu, Karen and Wu, Jiajun and Fei-Fei, Li},
  booktitle = 	 {Proceedings of The 6th Conference on Robot Learning},
  pages = 	 {80--93},
  year = 	 {2023},
  volume = 	 {205},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {14--18 Dec},
  publisher =    {PMLR},
}

[ECCV2022] DexMV: Imitation Learning for Dexterous Manipulation from Human Videos, [Project], [Paper], [Code].

Keywords: dexterous manipulation; record large-scale demonstrations of human hands performing the same tasks, and convert human motion to robot demonstrations; train imitation learning agents in a simulation environment; benchmark multiple imitation learning algorithms with the collected demonstrations.

Motivation: To tackle complex robot dexterous manipulation tasks by imitation learning.

@inproceedings{qin2022dexmv,
  title={Dexmv: Imitation learning for dexterous manipulation from human videos},
  author={Qin, Yuzhe and Wu, Yueh-Hua and Liu, Shaowei and Jiang, Hanwen and Yang, Ruihan and Fu, Yang and Wang, Xiaolong},
  booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXIX},
  pages={570--587},
  year={2022},
  organization={Springer}
}

[CVPR2022] IFOR: Iterative Flow Minimization for Robotic Object Rearrangement, [Paper].

Keywords: input RGBD image of the original and final scenes; object rearrangement for unknown objects, handle objects with translation and planar rotations; trained on synthetic data, transfer to real-world in zero-shot manner.

@InProceedings{Goyal_2022_CVPR,
    author    = {Goyal, Ankit and Mousavian, Arsalan and Paxton, Chris and Chao, Yu-Wei and Okorn, Brian and Deng, Jia and Fox, Dieter},
    title     = {IFOR: Iterative Flow Minimization for Robotic Object Rearrangement},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {14787-14797}
}

[RA-L2022] CALVIN: A Benchmark for Language-conditioned Policy Learning for Long-horizon Robot Manipulation Tasks, [Paper], [Code].

Keywords: language conditioned long-horizon manipulation; 34 tasks; 4 simulation environments; 7-DoF Panda robot; a static camera and a robot gripper camera; RGB-D image; unstructured demonstration dataset, ∼2.4M interaction steps.

@ARTICLE{9788026,
  author={Mees, Oier and Hermann, Lukas and Rosete-Beas, Erick and Burgard, Wolfram},
  journal={IEEE Robotics and Automation Letters}, 
  title={CALVIN: A Benchmark for Language-Conditioned Policy Learning for Long-Horizon Robot Manipulation Tasks}, 
  year={2022},
  volume={7},
  number={3},
  pages={7327-7334},
  doi={10.1109/LRA.2022.3180108}
}

[RSS2021] NeRP: Neural Rearrangement Planning for Unknown Objects, [Paper].

Keywords: multi-step object rearrangement planning, for unknown objects; input RGBD image of the original and final scenes; need to segment out unique objects in scene, compute object alignment between current and goal state; train on synthetic data.

@INPROCEEDINGS{Qureshi-RSS-21, 
    AUTHOR    = {Ahmed H Qureshi AND Arsalan Mousavian AND Chris Paxton AND Michael Yip AND Dieter Fox}, 
    TITLE     = {{NeRP: Neural Rearrangement Planning for Unknown Objects}}, 
    BOOKTITLE = {Proceedings of Robotics: Science and Systems}, 
    YEAR      = {2021}, 
    ADDRESS   = {Virtual}, 
    MONTH     = {July}, 
    DOI       = {10.15607/RSS.2021.XVII.072} 
} 

[CoRL2021] CLIPort: What and Where Pathways for Robotic Manipulation, [Project], [Paper], [Code].

Keywords: propose a two-stream architecture with semantic and spatial pathways for vision-based manipulation; propose CLIPORT, a language-conditioned imitation learning agent, can learn a single language-conditioned policy for various tabletop tasks.

@inproceedings{shridhar2021cliport,
  title     = {CLIPort: What and Where Pathways for Robotic Manipulation},
  author    = {Shridhar, Mohit and Manuelli, Lucas and Fox, Dieter},
  booktitle = {Proceedings of the 5th Conference on Robot Learning (CoRL)},
  year      = {2021},
}

[CVPR2021] ManipulaTHOR: A Framework for Visual Object Manipulation, [Project], [Paper], [Code], [Dataset].

Keywords: visual navigation and object manipulation; simulation environment; dataset includes 30 kitchen scenes, 150+ object categories; sensors include RGB-D, GPS, agent's location and arm configuration.

@inproceedings{ehsani2021manipulathor,
  title={Manipulathor: A framework for visual object manipulation},
  author={Ehsani, Kiana and Han, Winson and Herrasti, Alvaro and VanderBilt, Eli and Weihs, Luca and Kolve, Eric and Kembhavi, Aniruddha and Mottaghi, Roozbeh},
  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
  pages={4497--4506},
  year={2021}
}

[ICRA2022] Audio-Visual Grounding Referring Expression for Robotic Manipulation, [Paper].

Keywords: a novel task, audio-visual grounding referring expression for robotic manipulation; establishes a dataset which contains visual data, auditory data and manipulation instructions.

@INPROCEEDINGS{9811895,
  author={Wang, Yefei and Wang, Kaili and Wang, Yi and Guo, Di and Liu, Huaping and Sun, Fuchun},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Audio-Visual Grounding Referring Expression for Robotic Manipulation}, 
  year={2022},
  pages={9258-9264},
  doi={10.1109/ICRA46639.2022.9811895}
}

[ICRA2022] StructFormer: Learning Spatial Structure for Language-Guided Semantic Rearrangement of Novel Objects, [Paper].

Keywords: language-guided semantic rearrangement; transformer-based method; scene point cloud and structured language command input; output plan sequence, no 6D grasp.

@INPROCEEDINGS{9811931,
  author={Liu, Weiyu and Paxton, Chris and Hermans, Tucker and Fox, Dieter},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={StructFormer: Learning Spatial Structure for Language-Guided Semantic Rearrangement of Novel Objects}, 
  year={2022},
  pages={6322-6329},
  doi={10.1109/ICRA46639.2022.9811931}
}

2.3 6D pose estimation

[CVPR2022] OnePose: One-Shot Object Pose Estimation without CAD Models, [Project], [Paper], [Code], [Dataset].

Keywords: handle objects in arbitrary categories without instance or category-specific network training; release a large-scale dataset; input RGB video scan of the object and query image.

Motivation: To alleviate the demand for CAD models or category-specific training.

@article{sun2022onepose,
    title={{OnePose}: One-Shot Object Pose Estimation without {CAD} Models},
    author = {Sun, Jiaming and Wang, Zihao and Zhang, Siyu and He, Xingyi and Zhao, Hongcheng and Zhang, Guofeng and Zhou, Xiaowei},
    journal={CVPR},
    year={2022},
}

[CVPR2022] CPPF: Towards Robust Category-Level 9D Pose Estimation in the Wild, [Paper], [Code].

Keywords: category-level; point-pair features; voting method; sim-to-real transfer, trained on synthetic models, tested on real-world data, need an instance segmentation network.

@inproceedings{you2022cppf,
  title={CPPF: Towards Robust Category-Level 9D Pose Estimation in the Wild},
  author={You, Yang and Shi, Ruoxi and Wang, Weiming and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2022}
}
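
CPPF votes with point-pair features. The classic 4-dimensional PPF for two oriented points records the pair distance, the angle between each normal and the connecting line, and the angle between the normals; CPPF's actual feature set differs in detail. A small numpy sketch of the classic PPF:

import numpy as np

def point_pair_feature(p1, n1, p2, n2):
    """Classic PPF: (||d||, angle(n1, d), angle(n2, d), angle(n1, n2))."""
    d = p2 - p1
    dist = np.linalg.norm(d)
    d_unit = d / (dist + 1e-8)
    def angle(a, b):
        return np.arccos(np.clip(np.dot(a, b), -1.0, 1.0))
    return np.array([dist, angle(n1, d_unit), angle(n2, d_unit), angle(n1, n2)])

# toy usage with two oriented points
p1, n1 = np.array([0.0, 0.0, 0.0]), np.array([0.0, 0.0, 1.0])
p2, n2 = np.array([0.1, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])
print(point_pair_feature(p1, n1, p2, n2))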

[RA-L2022] Estimating 6D Object Poses with Temporal Motion Reasoning for Robot Grasping in Cluttered Scenes, [Paper], [Code].

Keywords: multi-frame RGB-D sequences; YCB-Video dataset; temporal fusion, integrate the temporal motion information from RGB-D; predict stable pose sequences; handle heavy occlusion.

@article{huang2022estimating,
  title={Estimating 6D Object Poses with Temporal Motion Reasoning for Robot Grasping in Cluttered Scenes},
  author={Huang, Rui and Mu, Fengjun and Li, Wenjiang and Liu, Huaping and Cheng, Hong},
  journal={IEEE Robotics and Automation Letters},
  year={2022},
  publisher={IEEE}
}

[CVPR2019] Normalized Object Coordinate Space for Category-Level 6D Object Pose and Size Estimation, [Project], [Paper], [Code], [Supp].

Keywords: category-level 6D pose estimation; RGB-D image input; CAMERA25 and REAL275 datasets, 6 categories;

@InProceedings{Wang_2019_CVPR,
  author = {Wang, He and Sridhar, Srinath and Huang, Jingwei and Valentin, Julien and Song, Shuran and Guibas, Leonidas J.},
  title = {Normalized Object Coordinate Space for Category-Level 6D Object Pose and Size Estimation},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2019}
}
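
NOCS-style category-level methods recover the 6D pose and scale by aligning the predicted normalized object coordinates with the back-projected depth points, typically with a (RANSAC-wrapped) Umeyama similarity transform. A minimal numpy sketch of that alignment step, without RANSAC or outlier handling:

import numpy as np

def umeyama_alignment(nocs_pts, cam_pts):
    """Similarity transform (scale s, rotation R, translation t) with cam_pts ≈ s * R @ nocs_pts + t."""
    mu_n, mu_c = nocs_pts.mean(0), cam_pts.mean(0)
    x, y = nocs_pts - mu_n, cam_pts - mu_c
    cov = y.T @ x / len(nocs_pts)
    U, D, Vt = np.linalg.svd(cov)
    S = np.eye(3)
    if np.linalg.det(U) * np.linalg.det(Vt) < 0:
        S[2, 2] = -1
    R = U @ S @ Vt
    var_n = (x ** 2).sum() / len(nocs_pts)
    s = np.trace(np.diag(D) @ S) / var_n
    t = mu_c - s * R @ mu_n
    return s, R, t

# toy usage: recover a known scale/translation from synthetic correspondences
rng = np.random.default_rng(0)
nocs = rng.uniform(-0.5, 0.5, (100, 3))
s_gt, t_gt = 0.2, np.array([0.1, 0.0, 0.5])
cam = s_gt * nocs + t_gt
s, R, t = umeyama_alignment(nocs, cam)
print(np.round(s, 3), np.round(t, 3))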

[ICRA2019] Making Sense of Vision and Touch: Self-Supervised Learning of Multimodal Representations for Contact-Rich Tasks, [Project], [Paper], [Code].

Keywords: ICRA2019 best paper award; multimodal representation learning for contact rich tasks; self-supervised representation learning; decouple representation learning and policy learning, so it can achieve practical sample efficiency on real robot.

@INPROCEEDINGS{8793485,
  author={Lee, Michelle A. and Zhu, Yuke and Srinivasan, Krishnan and Shah, Parth and Savarese, Silvio and Fei-Fei, Li and Garg, Animesh and Bohg, Jeannette},
  booktitle={2019 International Conference on Robotics and Automation (ICRA)}, 
  title={Making Sense of Vision and Touch: Self-Supervised Learning of Multimodal Representations for Contact-Rich Tasks}, 
  year={2019},
  pages={8943-8950},
  doi={10.1109/ICRA.2019.8793485}
}

[RSS2018] PoseCNN: A Convolutional Neural Network for 6D Object Pose Estimation in Cluttered Scenes, [Project], [Paper], [Code].

Keywords: RGB image input; object segmentation and 6D pose output; release YCB-video dataset, 21 objects, 92 videos, 133827 frames.

@inproceedings{xiang2018posecnn,
    Author = {Xiang, Yu and Schmidt, Tanner and Narayanan, Venkatraman and Fox, Dieter},
    Title = {PoseCNN: A Convolutional Neural Network for 6D Object Pose Estimation in Cluttered Scenes},
    Journal   = {Robotics: Science and Systems (RSS)},
    Year = {2018}
}

2.4 Datasets

[arXiv2023] Open X-Embodiment: Robotic Learning Datasets and RT-X Models, [Project], [Paper], [Dataset].

Keywords: assemble a dataset from 22 different robots, demonstrating 527 skills (160266 tasks); learn generalizable robot policies.

@misc{open_x_embodiment_rt_x_2023,
title={Open {X-E}mbodiment: Robotic Learning Datasets and {RT-X} Models},
author = {Open X-Embodiment Collaboration and Abhishek Padalkar and Acorn Pooley and Ajinkya Jain and Alex Bewley and Alex Herzog and Alex Irpan and Alexander Khazatsky and Anant Rai and Anikait Singh and Anthony Brohan and Antonin Raffin and Ayzaan Wahid and Ben Burgess-Limerick and Beomjoon Kim and Bernhard Schölkopf and Brian Ichter and Cewu Lu and Charles Xu and Chelsea Finn and Chenfeng Xu and Cheng Chi and Chenguang Huang and Christine Chan and Chuer Pan and Chuyuan Fu and Coline Devin and Danny Driess and Deepak Pathak and Dhruv Shah and Dieter Büchler and Dmitry Kalashnikov and Dorsa Sadigh and Edward Johns and Federico Ceola and Fei Xia and Freek Stulp and Gaoyue Zhou and Gaurav S. Sukhatme and Gautam Salhotra and Ge Yan and Giulio Schiavi and Hao Su and Hao-Shu Fang and Haochen Shi and Heni Ben Amor and Henrik I Christensen and Hiroki Furuta and Homer Walke and Hongjie Fang and Igor Mordatch and Ilija Radosavovic and Isabel Leal and Jacky Liang and Jaehyung Kim and Jan Schneider and Jasmine Hsu and Jeannette Bohg and Jeffrey Bingham and Jiajun Wu and Jialin Wu and Jianlan Luo and Jiayuan Gu and Jie Tan and Jihoon Oh and Jitendra Malik and Jonathan Tompson and Jonathan Yang and Joseph J. Lim and João Silvério and Junhyek Han and Kanishka Rao and Karl Pertsch and Karol Hausman and Keegan Go and Keerthana Gopalakrishnan and Ken Goldberg and Kendra Byrne and Kenneth Oslund and Kento Kawaharazuka and Kevin Zhang and Keyvan Majd and Krishan Rana and Krishnan Srinivasan and Lawrence Yunliang Chen and Lerrel Pinto and Liam Tan and Lionel Ott and Lisa Lee and Masayoshi Tomizuka and Maximilian Du and Michael Ahn and Mingtong Zhang and Mingyu Ding and Mohan Kumar Srirama and Mohit Sharma and Moo Jin Kim and Naoaki Kanazawa and Nicklas Hansen and Nicolas Heess and Nikhil J Joshi and Niko Suenderhauf and Norman Di Palo and Nur Muhammad Mahi Shafiullah and Oier Mees and Oliver Kroemer and Pannag R Sanketi and Paul Wohlhart and Peng Xu and Pierre Sermanet and Priya Sundaresan and Quan Vuong and Rafael Rafailov and Ran Tian and Ria Doshi and Roberto Martín-Martín and Russell Mendonca and Rutav Shah and Ryan Hoque and Ryan Julian and Samuel Bustamante and Sean Kirmani and Sergey Levine and Sherry Moore and Shikhar Bahl and Shivin Dass and Shuran Song and Sichun Xu and Siddhant Haldar and Simeon Adebola and Simon Guist and Soroush Nasiriany and Stefan Schaal and Stefan Welker and Stephen Tian and Sudeep Dasari and Suneel Belkhale and Takayuki Osa and Tatsuya Harada and Tatsuya Matsushima and Ted Xiao and Tianhe Yu and Tianli Ding and Todor Davchev and Tony Z. Zhao and Travis Armstrong and Trevor Darrell and Vidhi Jain and Vincent Vanhoucke and Wei Zhan and Wenxuan Zhou and Wolfram Burgard and Xi Chen and Xiaolong Wang and Xinghao Zhu and Xuanlin Li and Yao Lu and Yevgen Chebotar and Yifan Zhou and Yifeng Zhu and Ying Xu and Yixuan Wang and Yonatan Bisk and Yoonyoung Cho and Youngwoon Lee and Yuchen Cui and Yueh-hua Wu and Yujin Tang and Yuke Zhu and Yunzhu Li and Yusuke Iwasawa and Yutaka Matsuo and Zhuo Xu and Zichen Jeff Cui},
howpublished  = {\url{https://arxiv.org/abs/2310.08864}},
year = {2023},
}

[CVPR2023] Habitat-Matterport 3D Semantics Dataset, [Project], [Paper], [Code].

Keywords: the largest dataset of 3D real-world spaces with densely annotated semantics; 142646 objects; 216 3D spaces; 3100 rooms;

@InProceedings{Yadav_2023_CVPR,
    author    = {Yadav, Karmesh and Ramrakhya, Ram and Ramakrishnan, Santhosh Kumar and Gervet, Theo and Turner, John and Gokaslan, Aaron and Maestre, Noah and Chang, Angel Xuan and Batra, Dhruv and Savva, Manolis and Clegg, Alexander William and Chaplot, Devendra Singh},
    title     = {Habitat-Matterport 3D Semantics Dataset},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2023},
    pages     = {4927-4936}
}

[CVPR2023] OmniObject3D: Large-Vocabulary 3D Object Dataset for Realistic Perception, Reconstruction and Generation, [Project], [Paper], [Code].

Keywords: 6000 scanned objects, 190 categories; Each 3D object is captured with both 2D and 3D sensors, providing textured meshes, point clouds, multi-view rendered images, and multiple real-captured videos.

Motivation: Recent advances in modeling 3D objects mostly rely on synthetic datasets due to the lack of large-scale real-scanned 3D databases. To facilitate the development of 3D perception, reconstruction, and generation in the real world, this paper proposes a large-scale object dataset.

@article{wu2023omniobject3d,
  author = {Tong Wu and Jiarui Zhang and Xiao Fu and Yuxin Wang and Jiawei Ren and Liang Pan and Wayne Wu and Lei Yang and Jiaqi Wang and Chen Qian and Dahua Lin and Ziwei Liu},
  title = {OmniObject3D: Large-Vocabulary 3D Object Dataset for Realistic Perception, Reconstruction and Generation},
  journal={IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2023}
}

[CVPR2023] MVImgNet: A Large-scale Dataset of Multi-view Images, [Project], [Paper].

Keywords: release a large-scale dataset of multi-view images, 6.5 million frames from 219188 videos covering objects from 238 classes; derive a 3D object point cloud dataset, 150 categories, 87200 samples.

Motivation: Due to the laborious collection of real-world 3D data, there is no generic dataset serving as a counterpart of ImageNet in 3D vision; this paper introduces MVImgNet.

@inproceedings{yu2023mvimgnet,
    title     = {MVImgNet: A Large-scale Dataset of Multi-view Images},
    author    = {Yu, Xianggang and Xu, Mutian and Zhang, Yidan and Liu, Haolin and Ye, Chongjie and Wu, Yushuang and Yan, Zizheng and Liang, Tianyou and Chen, Guanying and Cui, Shuguang and Han, Xiaoguang},
    booktitle = {CVPR},
    year      = {2023}
}

[ICRA2022] Google Scanned Objects: A High-Quality Dataset of 3D Scanned Household Items, [Paper], [Dataset].

Keywords: 3D scanned objects dataset; 1030 household objects.

Motivation: Handcrafted models built from polygons and primitives correspond poorly to real objects, and real-world data collection is challenging. This paper provides a large-scale 3D scanned objects dataset for public research.

@INPROCEEDINGS{9811809,
  author={Downs, Laura and Francis, Anthony and Koenig, Nate and Kinman, Brandon and Hickman, Ryan and Reymann, Krista and McHugh, Thomas B. and Vanhoucke, Vincent},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Google Scanned Objects: A High-Quality Dataset of 3D Scanned Household Items}, 
  year={2022},
  volume={},
  number={},
  pages={2553-2560},
  doi={10.1109/ICRA46639.2022.9811809}
}

[ECCV2022] TO-Scene: A Large-scale Dataset for Understanding 3D Tabletop Scenes, [Paper], [Code].

Keywords: large-scale 3D dataset; tabletop scenes, contains 20740 scenes; objects (sim) from ModelNet and ShapeNet, 55 classes, 51300 models; tables (real-world) from ScanNet.

@inproceedings{xu2022toscene,
  title={TO-Scene: A Large-scale Dataset for Understanding 3D Tabletop Scenes},
  author={Xu, Mutian and Chen, Pei and Liu, Haolin and Han, Xiaoguang},
  booktitle={ECCV},
  year={2022}
}

3. Grasp Detection

3.1 General grasping

[RA-L2023] GPDAN: Grasp Pose Domain Adaptation Network for Sim-to-Real 6-DoF Object Grasping, [Paper].

Keywords: 6D grasp; sim-to-real domain adaptation, ACRONYM -> GraspNet-1Billion.

@ARTICLE{10153686,
  author={Zheng, Liming and Ma, Wenxuan and Cai, Yinghao and Lu, Tao and Wang, Shuo},
  journal={IEEE Robotics and Automation Letters}, 
  title={GPDAN: Grasp Pose Domain Adaptation Network for Sim-to-Real 6-DoF Object Grasping}, 
  year={2023},
  volume={8},
  number={8},
  pages={4585-4592},
  doi={10.1109/LRA.2023.3286816}}

[IROS2023] Multi-Source Fusion for Voxel-Based 7-DoF Grasping Pose Estimation, [Paper].

Keywords: 6D grasp.

@INPROCEEDINGS{10341840,
  author={Qiu, Junning and Wang, Fei and Dang, Zheng},
  booktitle={2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 
  title={Multi-Source Fusion for Voxel-Based 7-DoF Grasping Pose Estimation}, 
  year={2023},
  volume={},
  number={},
  pages={968-975},
  doi={10.1109/IROS55552.2023.10341840}}

[arXiv2023] Grasp-Anything: Large-scale Grasp Dataset from Foundation Models, [Project], [Paper], [Code].

Keywords: 2D grasping; leverage knowledge from foundation models to generate a large-scale grasping dataset with 1M samples and 3M objects, substantially surpassing prior datasets in diversity and magnitude.

@misc{vuong2023graspanything,
      title={Grasp-Anything: Large-scale Grasp Dataset from Foundation Models}, 
      author={An Dinh Vuong and Minh Nhat Vu and Hieu Le and Baoru Huang and Binh Huynh and Thieu Vo and Andreas Kugi and Anh Nguyen},
      year={2023},
      eprint={2309.09818},
      archivePrefix={arXiv},
      primaryClass={cs.RO}
}

[arXiv2023] Learning Tri-mode Grasping for Ambidextrous Robot Picking, [Paper].

Keywords: ambidextrous robot picking; 6D grasp; parallel-jaw gripper grasp + suction grasp + push; cluttered scenes.

Motivation: The fusion of grasping and suction can expand the range of objects that can be picked, and the fusion of prehensile and non-prehensile actions can expand the picking space of an ambidextrous robot. Thus, this paper proposes a Push-Grasp-Suction tri-mode grasping strategy.

@misc{zhou2023learning,
      title={Learning Tri-mode Grasping for Ambidextrous Robot Picking}, 
      author={Chenlin Zhou and Peng Wang and Wei Wei and Guangyun Xu and Fuyu Li and Jia Sun},
      year={2023},
      eprint={2302.06431},
      archivePrefix={arXiv},
      primaryClass={cs.RO}
}

[ICRA2023] GraspNeRF: Multiview-based 6-DoF Grasp Detection for Transparent and Specular Objects Using Generalizable NeRF, [Project], [Paper], [Code].

Keywords: 6D grasp; cluttered scene; for transparent and specular objects; input multi-view RGBs; leverage generalizable neural radiance field to predict TSDF.

Motivation: To tackle 6-DoF grasp detection for transparent and specular objects, this paper proposes a multi-view RGB-based network that achieves material-agnostic object grasping in clutter.

@article{Dai2023GraspNeRF,
  title={GraspNeRF: Multiview-based 6-DoF Grasp Detection for Transparent and Specular Objects Using Generalizable NeRF},
  author={Qiyu Dai and Yan Zhu and Yiran Geng and Ciyu Ruan and Jiazhao Zhang and He Wang},
  booktitle={IEEE International Conference on Robotics and Automation (ICRA)},
  year={2023}
}

[ICRA2023] Keypoint-GraspNet: Keypoint-based 6-DoF Grasp Generation from the Monocular RGB-D input, [Paper].

Keywords: 6D grasp; RGB-D input; first detect grasp keypoints on RGB-D image, then recover the grasp poses with PnP algorithm; trained on synthetic dataset; faster than point cloud based methods.

Motivation: Point-cloud-based methods are prone to failure on small objects; this paper explores 6-DoF grasp generation directly from RGB-D image input.

@article{chen2022keypoint,
  title={Keypoint-GraspNet: Keypoint-based 6-DoF Grasp Generation from the Monocular RGB-D input},
  author={Chen, Yiye and Lin, Yunzhi and Vela, Patricio},
  journal={arXiv preprint arXiv:2209.08752},
  year={2022}
}
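
The recovery step of Keypoint-GraspNet can be illustrated with OpenCV's PnP solver: given 2D detections of a set of canonical gripper keypoints and their 3D coordinates in the gripper frame, solvePnP returns the grasp rotation and translation in the camera frame. A sketch with a made-up keypoint layout and intrinsics, not the paper's exact keypoint definition:

import numpy as np
import cv2

# hypothetical canonical gripper keypoints in the gripper frame (meters, coplanar for PnP init)
gripper_kpts_3d = np.array([[0.00, 0.0, 0.00],    # gripper base
                            [0.04, 0.0, 0.06],    # left fingertip
                            [-0.04, 0.0, 0.06],   # right fingertip
                            [0.00, 0.0, -0.03]],  # wrist point
                           dtype=np.float64)
K = np.array([[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]])  # assumed intrinsics

# simulate keypoint detections by projecting with a known grasp pose
rvec_gt = np.array([0.1, 0.3, 0.0])
tvec_gt = np.array([0.05, 0.0, 0.5])
kpts_2d, _ = cv2.projectPoints(gripper_kpts_3d, rvec_gt, tvec_gt, K, None)
kpts_2d = kpts_2d.reshape(-1, 2)

# recover the 6-DoF grasp pose from the 2D keypoints, as the keywords above describe
ok, rvec, tvec = cv2.solvePnP(gripper_kpts_3d, kpts_2d, K, None)
R, _ = cv2.Rodrigues(rvec)                 # grasp rotation in the camera frame
print(ok, np.round(tvec.ravel(), 3))       # should recover tvec_gt up to numerical noise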

[ICRA2023] RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention, [Paper].

Keywords: 2D grasp; RGB-D input; Graspnet-1billion dataset; propose depth guided cross-modal attention network.

Motivation: The quality of depth maps captured by RGB-D sensors is relatively low, which makes obtaining grasping depth and multi-modal clues fusion challenging. To address the two issues, this paper proposes Depth Guided Cross-modal Attention Network.

@article{qin2023rgb,
  title={RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention},
  author={Qin, Ran and Ma, Haoxiang and Gao, Boyang and Huang, Di},
  journal={arXiv preprint arXiv:2302.14264},
  year={2023}
}

[CoRL2022] Towards Scale Balanced 6-DoF Grasp Detection in Cluttered Scenes, [Paper], [Code].

Keywords: 6D grasp; cluttered scene; Graspnet-1billion dataset; data augmentation, mix the point clouds of synthetic data and real-scene data; pretrain an unseen point cloud instance segmentation network to generate masks for all objects, and then uniformly sample points from all objects for grasp learning; balance the grasp learning across different grasp width scales.

Motivation: To address the difficulty in dealing with small-scale samples.

@InProceedings{Ma_2021_BMVC,
    author    = {Ma, Haoxiang and Huang, Di},
    title     = {Towards Scale Balanced 6-DoF Grasp Detection in Cluttered Scenes},
    booktitle = {Conference on Robot Learning (CoRL)},
    year      = {2022}
}

[CoRL2022] Volumetric-based Contact Point Detection for 7-DoF Grasping, [Paper], [Code].

Keywords: 6D grasp; cluttered scenes; trained on synthetic data; TSDF-based; pipeline, multi-view fusion, contact-point sampling, evaluation and collision checking.

@inproceedings{cai2022volumetric,
    title     = {Volumetric-based Contact Point Detection for 7-DoF Grasping},
    author    = {Cai, Junhao and Su, Jingcheng and Zhou, Zida and Cheng, Hui and Chen, Qifeng and Wang, Michael Yu},
    booktitle={Conference on Robot Learning (CoRL)},
    year={2022},
    organization={PMLR}
}

[RA-L2022] DA2 Dataset: Toward Dexterity-Aware Dual-Arm Grasping, [Project], [Paper], [Code].

Keywords: 6D grasp; single object; dual-arm grasping for large objects in simulation; release a large scale dual-arm grasping dataset, 6327 objects, 9M grasp pairs.

@article{da2dataset,
  author={Zhai, Guangyao and Zheng, Yu and Xu, Ziwei and Kong, Xin and Liu, Yong and Busam, Benjamin and Ren, Yi and Navab, Nassir and Zhang, Zhengyou},
  journal={IEEE Robotics and Automation Letters}, 
  title={DA$^2$ Dataset: Toward Dexterity-Aware Dual-Arm Grasping}, 
  year={2022},
  volume={7},
  number={4},
  pages={8941-8948},
  doi={10.1109/LRA.2022.3189959}
}

[RA-L2022] End-to-End Learning to Grasp via Sampling From Object Point Clouds, [Paper], [Code].

Keywords: 6D grasp; single object; point cloud input; combines a differentiable sampling strategy to identify the visible contact points, then uses a classifier and a regressor to predict the other contact point and the grasp angle.

@ARTICLE{9830843,
  author={Alliegro, Antonio and Rudorfer, Martin and Frattin, Fabio and Leonardis, Aleš and Tommasi, Tatiana},
  journal={IEEE Robotics and Automation Letters}, 
  title={End-to-End Learning to Grasp via Sampling From Object Point Clouds}, 
  year={2022},
  volume={7},
  number={4},
  pages={9865-9872},
  doi={10.1109/LRA.2022.3191183}
}

[RA-L2022] EfficientGrasp: A Unified Data-Efficient Learning to Grasp Method for Multi-Fingered Robot Hands, [Paper].

Keywords: single object grasping; multi-finger gripper; generalizes to different types of robotic grippers; uses the fingertip workspace point set as the gripper attribute input and detects the contact points on the object point cloud.

@ARTICLE{9813387,
  author={Li, Kelin and Baron, Nicholas and Zhang, Xian and Rojas, Nicolas},
  journal={IEEE Robotics and Automation Letters}, 
  title={EfficientGrasp: A Unified Data-Efficient Learning to Grasp Method for Multi-Fingered Robot Hands}, 
  year={2022},
  volume={7},
  number={4},
  pages={8619-8626},
  doi={10.1109/LRA.2022.3187875}
}

[RA-L2022] SymmetryGrasp: Symmetry-Aware Antipodal Grasp Detection From Single-View RGB-D Images, [Paper].

Keywords: 6D grasp; input RGBD image; single view; Mask-RCNN for symmetric region detection on the RGB-D image, transform the RGB-D region to a point cloud and apply PointNet++ for grasp detection.

@ARTICLE{9919329,
  author={Shi, Yifei and Tang, Zixin and Cai, Xiangting and Zhang, Hongjia and Hu, Dewen and Xu, Xin},
  journal={IEEE Robotics and Automation Letters}, 
  title={SymmetryGrasp: Symmetry-Aware Antipodal Grasp Detection From Single-View RGB-D Images}, 
  year={2022},
  volume={7},
  number={4},
  pages={12235-12242},
  doi={10.1109/LRA.2022.3214785}
}

[ECCV2022] Domain Randomization-Enhanced Depth Simulation and Restoration for Perceiving and Grasping Specular and Transparent Objects, [Paper], [Code].

Keywords: Depth restoration for robotic grasping; Swin-Tiny backbone for depth restoration, two-stream net for rgb and depth feature extraction; graspnet-baseline for 6D grasp.

@inproceedings{dai2022dreds,
    title={Domain Randomization-Enhanced Depth Simulation and Restoration for Perceiving and Grasping Specular and Transparent Objects},
    author={Dai, Qiyu and Zhang, Jiyao and Li, Qiwei and Wu, Tianhao and Dong, Hao and Liu, Ziyuan and Tan, Ping and Wang, He},
    booktitle={European Conference on Computer Vision (ECCV)},
    year={2022}
}

[ECCV2022] TransGrasp: Grasp Pose Estimation of a Category of Objects by Transferring Grasps from Only One Labeled Instance, [Paper], [Code].

Keywords: 6D grasp; single object point cloud; from one instance to one category; 3 categories, objects model from ShapeNetCore; metric is grasp success rate in simulation environment; compare with GPD and 6-DOF GraspNet.

@inproceedings{wen2022transgrasp,
  title={TransGrasp: Grasp Pose Estimation of a Category of Objects by Transferring Grasps from Only One Labeled Instance},
  author={Wen, Hongtao and Yan, Jianhang and Peng, Wanli and Sun, Yi},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2022}
}

[ICRA2022] Hybrid Physical Metric For 6-DoF Grasp Pose Detection, [Paper], [Code].

Keywords: 6D grasp; cluttered scene; real-world data; propose a new grasp score based on Graspnet-1billion, take force-closure metric, object flatness, gravity and collision into consideration.

@INPROCEEDINGS{9811961,
  author={Lu, Yuhao and Deng, Beixing and Wang, Zhenyu and Zhi, Peiyuan and Li, Yali and Wang, Shengjin},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Hybrid Physical Metric For 6-DoF Grasp Pose Detection}, 
  year={2022},
  pages={8238-8244},
  doi={10.1109/ICRA46639.2022.9811961}
}
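
The force-closure part of such grasp scores is often approximated, for a two-finger grasp, by the antipodal condition: each (inward) contact normal must lie inside the friction cone around the line joining the two contacts, i.e. its angle to that line must not exceed arctan(mu). A small numpy sketch of this check, which is not the paper's full hybrid metric:

import numpy as np

def antipodal_force_closure(c1, n1, c2, n2, mu=0.5):
    """Two-finger force-closure test: contact normals (pointing into the object)
    must lie within the friction cone around the line joining the contacts."""
    axis = c2 - c1
    axis = axis / np.linalg.norm(axis)
    half_angle = np.arctan(mu)                      # friction cone half-angle
    a1 = np.arccos(np.clip(np.dot(n1, axis), -1, 1))
    a2 = np.arccos(np.clip(np.dot(n2, -axis), -1, 1))
    return a1 <= half_angle and a2 <= half_angle

# toy usage: parallel-jaw contacts on opposite sides of a box
print(antipodal_force_closure(np.array([0.03, 0, 0]), np.array([-1.0, 0, 0]),
                              np.array([-0.03, 0, 0]), np.array([1.0, 0, 0])))  # True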

[ICRA2022] Context-Aware Grasp Generation in Cluttered Scenes, [Paper].

Keywords: 6D grasp; Graspnet-1billion dataset; cluttered scene; real-world data; PointNet++ backbone, vote and cluster seed points; self-attention mechanism for context learning.

@INPROCEEDINGS{9811371,
  author={Hoang, Dinh-Cuong and Stork, Johannes A. and Stoyanov, Todor},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Context-Aware Grasp Generation in Cluttered Scenes}, 
  year={2022},
  pages={1492-1498},
  doi={10.1109/ICRA46639.2022.9811371}
}

[RA-L2022] Real-Time Collision-Free Grasp Pose Detection With Geometry-Aware Refinement Using High-Resolution Volume, [Project], [Paper].

Keywords: 6D grasp; cluttered scene; multi-frame depth maps are integrated to get TSDF volume; use a light-weight volume-point network to extract 3D features.

@ARTICLE{9681231,
  author={Cai, Junhao and Cen, Jun and Wang, Haokun and Wang, Michael Yu},
  journal={IEEE Robotics and Automation Letters}, 
  title={Real-Time Collision-Free Grasp Pose Detection With Geometry-Aware Refinement Using High-Resolution Volume}, 
  year={2022},
  volume={7},
  number={2},
  pages={1888-1895},
  doi={10.1109/LRA.2022.3142424}
}

[arXiv2022] A Robotic Visual Grasping Design: Rethinking Convolution Neural Network with High-Resolutions, [Paper], [Code].

Keywords: 2D planar grasp; Cornell and Jacquard grasping datasets; high-resolution CNN for feature extraction; D/RGB/RGB-D input.

@article{zhou2022robotic,
  title={A Robotic Visual Grasping Design: Rethinking Convolution Neural Network with High-Resolutions},
  author={Zhou, Zhangli and Wang, Shaochen and Chen, Ziyang and Cai, Mingyu and Kan, Zhen},
  journal={arXiv preprint arXiv:2209.07459},
  year={2022}
}

[RA-L2022] When Transformer Meets Robotic Grasping: Exploits Context for Efficient Grasp Detection, [Paper], [Code].

Keywords: 2D planar grasp; Cornell and Jacquard grasping datasets; cluttered scene; Transformer based architecture; D/RGB/RGB-D input.

@ARTICLE{9810182,
  author={Wang, Shaochen and Zhou, Zhangli and Kan, Zhen},
  journal={IEEE Robotics and Automation Letters}, 
  title={When Transformer Meets Robotic Grasping: Exploits Context for Efficient Grasp Detection}, 
  year={2022},
  volume={7},
  number={3},
  pages={8170-8177},
  doi={10.1109/LRA.2022.3187261}
}

[ICCV2021] Graspness Discovery in Clutters for Fast and Accurate Grasp Detection, [Paper], [Code(non-official)], [Supp].

Keywords: 6D general grasp; cluttered scene; real-world dataset GraspNet-1billion; single-view scene point cloud input; MinkowskiEngine sparse convolution, ResUNet14.

@InProceedings{Wang_2021_ICCV,
    author    = {Wang, Chenxi and Fang, Hao-Shu and Gou, Minghao and Fang, Hongjie and Gao, Jin and Lu, Cewu},
    title     = {Graspness Discovery in Clutters for Fast and Accurate Grasp Detection},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2021},
    pages     = {15964-15973}
}

[IROS2021] Simultaneous Semantic and Collision Learning for 6-DoF Grasp Pose Estimation, [paper].

Keywords: 6D grasp; cluttered scene; single-view scene point cloud input; real-world dataset GraspNet-1billion; jointly predict grasp poses, semantic segmentation and collision detection.

@inproceedings{li2021simultaneous,
  title={Simultaneous Semantic and Collision Learning for 6-DoF Grasp Pose Estimation},
  author={Li, Yiming and Kong, Tao and Chu, Ruihang and Li, Yifeng and Wang, Peng and Li, Lei},
  booktitle={2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  pages={3571--3578},
  year={2021},
  organization={IEEE}
}

[ICRA2021] RGB Matters: Learning 7-DoF Grasp Poses on Monocular RGBD Images, [Paper].

Keywords: 6D general grasp; cluttered scene; RGB and single-view point cloud input; real-world dataset GraspNet-1billion.

@INPROCEEDINGS{9561409,
  author={Gou, Minghao and Fang, Hao-Shu and Zhu, Zhanda and Xu, Sheng and Wang, Chenxi and Lu, Cewu},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={RGB Matters: Learning 7-DoF Grasp Poses on Monocular RGBD Images}, 
  year={2021},
  pages={13459-13466},
  doi={10.1109/ICRA48506.2021.9561409}
}

[RA-L2021] SuctionNet-1Billion: A Large-Scale Benchmark for Suction Grasping, [Project], [Paper], [Code].

Keywords: suction; cluttered scene; RGBD input; release a large-scale real-world suction dataset.

@ARTICLE{9547830,
  author={Cao, Hanwen and Fang, Hao-Shu and Liu, Wenhai and Lu, Cewu},
  journal={IEEE Robotics and Automation Letters}, 
  title={SuctionNet-1Billion: A Large-Scale Benchmark for Suction Grasping}, 
  year={2021},
  volume={6},
  number={4},
  pages={8718-8725},
  doi={10.1109/LRA.2021.3115406}
}

[RSS2021] Synergies Between Affordance and Geometry: 6-DoF Grasp Detection via Implicit Representations, [Project], [Paper], [Code].

Keywords: cluttered scene; 6D grasp; multi-task learning, 3D reconstruction and grasp detection; train the model on self-supervised grasp trial data in simulation.

@article{jiang2021synergies,
 author = {Jiang, Zhenyu and Zhu, Yifeng and Svetlik, Maxwell and Fang, Kuan and Zhu, Yuke},
 journal = {Robotics: science and systems},
 title = {Synergies Between Affordance and Geometry: 6-DoF Grasp Detection via Implicit Representations},
 year = {2021}
}

[ICRA2021] GPR: Grasp Pose Refinement Network for Cluttered Scenes, [Paper].

Keywords: 6D grasp; two-stage method; self-made dataset in simulation; cluttered scene in simulation; PointNet++ backbone.

@inproceedings{wei2021gpr,
  title={Gpr: Grasp pose refinement network for cluttered scenes},
  author={Wei, Wei and Luo, Yongkang and Li, Fuyu and Xu, Guangyun and Zhong, Jun and Li, Wanyi and Wang, Peng},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={4295--4302},
  year={2021},
  organization={IEEE}
}

[ICRA2021] Contact-GraspNet: Efficient 6-DoF Grasp Generation in Cluttered Scenes, [Project], [Paper], [Code].

Keywords: 6D grasp; object grasp dataset is ACRONYM; cluttered scene in simulation; single-view scene point cloud (20000 points) input; backbone based on PointNet++.

@inproceedings{sundermeyer2021contact,
  title={Contact-graspnet: Efficient 6-dof grasp generation in cluttered scenes},
  author={Sundermeyer, Martin and Mousavian, Arsalan and Triebel, Rudolph and Fox, Dieter},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={13438--13444},
  year={2021},
  organization={IEEE}
}

[ICRA2021] Robotic Grasping through Combined Image-Based Grasp Proposal and 3D Reconstruction, [Paper].

Keywords: input RGB-D image; single object; 6D grasp; multi-task learning, point cloud reconstruction and grasp generation.

@INPROCEEDINGS{9562046,
  author={Yang, Daniel and Tosun, Tarik and Eisner, Benjamin and Isler, Volkan and Lee, Daniel},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={Robotic Grasping through Combined Image-Based Grasp Proposal and 3D Reconstruction}, 
  year={2021},
  volume={},
  number={},
  pages={6350-6356},
  doi={10.1109/ICRA48506.2021.9562046}
}

[ICRA2021] REGNet: REgion-based Grasp Network for End-to-end Grasp Detection in Point Clouds, [Paper].

Keywords: input point cloud; PointNet++ backbone; 6D grasp; single object; 3-stage single-shot network, Score Network (SN), Grasp Region Network (GRN) and Refine Network (RN).

@INPROCEEDINGS{9561920,
  author={Zhao, Binglei and Zhang, Hanbo and Lan, Xuguang and Wang, Haoyu and Tian, Zhiqiang and Zheng, Nanning},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={REGNet: REgion-based Grasp Network for End-to-end Grasp Detection in Point Clouds}, 
  year={2021},
  pages={13474-13480},
  doi={10.1109/ICRA48506.2021.9561920}
}

[ICRA2021] Acronym: A large-scale grasp dataset based on simulation, [Project], [Paper], [Code].

Keywords: 6D grasp; release a grasp dataset in simulation; 8872 objects, 262 categories, 17.7M grasps; in addition to single objects, ACRONYM also contains scenes with structured clutter.

@inproceedings{eppner2021acronym,
  title={ACRONYM: A large-scale grasp dataset based on simulation},
  author={Eppner, Clemens and Mousavian, Arsalan and Fox, Dieter},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={6222--6227},
  year={2021},
  organization={IEEE}
}

[CVPR2020] GraspNet-1Billion: A Large-Scale Benchmark for General Object Grasping, [Project&&Dataset], [Paper], [Code], [Supp].

Keywords: 6D general grasp; release a large-scale real-world dataset; cluttered scene; single-view scene point cloud input; PointNet++ backbone.

@inproceedings{fang2020graspnet,
  title={GraspNet-1Billion: A Large-Scale Benchmark for General Object Grasping},
  author={Fang, Hao-Shu and Wang, Chenxi and Gou, Minghao and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11444--11453},
  year={2020}
}

[NIPS2020] Grasp Proposal Networks: An End-to-End Solution for Visual Learning of Robotic Grasps, [Paper], [Code], [Supp].

Keywords: 6D grasp; single object; single-view point cloud input; PointNet++ backbone; self-made synthetic dataset based on ShapeNetSem in simulation.

@inproceedings{NEURIPS2020_994d1cad,
 author = {Wu, Chaozheng and Chen, Jian and Cao, Qiaoyu and Zhang, Jianchi and Tai, Yunxin and Sun, Lin and Jia, Kui},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
 pages = {13174--13184},
 publisher = {Curran Associates, Inc.},
 title = {Grasp Proposal Networks: An End-to-End Solution for Visual Learning of Robotic Grasps},
 volume = {33},
 year = {2020}
}

[CoRL2020] Volumetric Grasping Network: Real-time 6 DOF Grasp Detection in Clutter, [Paper], [Code].

Keywords: 6D grasp; Truncated Signed Distance Function (TSDF) representation of the scene; cluttered scene; trained on a synthetic grasping dataset generated with physics simulation.

@inproceedings{breyer2020volumetric,
 title={Volumetric Grasping Network: Real-time 6 DOF Grasp Detection in Clutter},
 author={Breyer, Michel and Chung, Jen Jen and Ott, Lionel and Siegwart, Roland and Nieto, Juan},
 booktitle={Conference on Robot Learning},
 year={2020},
}
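
VGN (and the TSDF-based entries above) take a Truncated Signed Distance Function volume fused from depth images as input. A compact numpy sketch of the standard TSDF fusion update for one depth frame; the intrinsics, grid size and truncation distance are assumed values:

import numpy as np

def fuse_depth_into_tsdf(tsdf, weights, depth, K, T_cam_world, origin, voxel_size, trunc=0.04):
    """One TSDF fusion step: project every voxel into the depth image and
    average the truncated signed distance (camera looks along +z)."""
    nx, ny, nz = tsdf.shape
    ii, jj, kk = np.meshgrid(np.arange(nx), np.arange(ny), np.arange(nz), indexing="ij")
    pts_w = origin + (np.stack([ii, jj, kk], -1).reshape(-1, 3) + 0.5) * voxel_size
    pts_c = (T_cam_world[:3, :3] @ pts_w.T + T_cam_world[:3, 3:4]).T    # world -> camera
    z = pts_c[:, 2]
    uv = (K @ pts_c.T).T
    u = np.round(uv[:, 0] / np.maximum(z, 1e-6)).astype(int)
    v = np.round(uv[:, 1] / np.maximum(z, 1e-6)).astype(int)
    valid = (z > 0) & (u >= 0) & (u < depth.shape[1]) & (v >= 0) & (v < depth.shape[0])
    d_obs = np.zeros_like(z)
    d_obs[valid] = depth[v[valid], u[valid]]
    sdf = d_obs - z                                   # signed distance along the viewing ray
    valid &= (d_obs > 0) & (sdf > -trunc)
    tsdf_obs = np.clip(sdf / trunc, -1.0, 1.0)
    flat_t, flat_w = tsdf.reshape(-1), weights.reshape(-1)
    flat_t[valid] = (flat_t[valid] * flat_w[valid] + tsdf_obs[valid]) / (flat_w[valid] + 1)
    flat_w[valid] += 1
    return flat_t.reshape(tsdf.shape), flat_w.reshape(weights.shape)

# toy usage: a 40^3 grid covering a 0.3 m cube, one flat depth image at 0.5 m
grid = np.ones((40, 40, 40)); w = np.zeros_like(grid)
K = np.array([[600.0, 0, 320.0], [0, 600.0, 240.0], [0, 0, 1.0]])   # assumed intrinsics
depth = np.full((480, 640), 0.5)
grid, w = fuse_depth_into_tsdf(grid, w, depth, K, np.eye(4),
                               origin=np.array([-0.15, -0.15, 0.3]), voxel_size=0.3 / 40)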

[CoRL2020] GDN: A Coarse-To-Fine (C2F) Representation for End-To-End 6-DoF Grasp Detection, [Paper].

Keywords: 6D grasp; one-stage method; single object; PointNet++ backbone; self-made dataset based on YCB.

@InProceedings{pmlr-v155-jeng21a,
  title =      {GDN: A Coarse-To-Fine (C2F) Representation for End-To-End 6-DoF Grasp Detection},
  author =       {Jeng, Kuang-Yu and Liu, Yueh-Cheng and Liu, Zhe Yu and Wang, Jen-Wei and Chang, Ya-Liang and Su, Hung-Ting and Hsu, Winston},
  booktitle =      {Proceedings of the 2020 Conference on Robot Learning},
  pages =      {220--231},
  year =      {2021},
  volume =      {155},
  series =      {Proceedings of Machine Learning Research},
  publisher =    {PMLR},
}

[ICRA2020] Learning to Generate 6-DoF Grasp Poses with Reachability Awareness, [Paper].

Keywords: 6D grasp; cluttered scene; sampling based grasp generation; point cloud voxelization, 3D CNN; grasp pose should be stable and reachable; train on synthetic dataset; self-supervised data collection.

@INPROCEEDINGS{9197413,
  author={Lou, Xibai and Yang, Yang and Choi, Changhyun},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={Learning to Generate 6-DoF Grasp Poses with Reachability Awareness}, 
  year={2020},
  volume={},
  number={},
  pages={1532-1538},
  doi={10.1109/ICRA40945.2020.9197413}
}

[ICRA2020] PointNet++ Grasping: Learning An End-to-end Spatial Grasp Generation Algorithm from Sparse Point Clouds, [Paper].

Keywords: end-to-end approach, directly predict grasp; 6D grasp; PointNet++ backbone; single/multi-object scene; point cloud input.

@inproceedings{ni2020pointnet++,
  title={Pointnet++ grasping: learning an end-to-end spatial grasp generation algorithm from sparse point clouds},
  author={Ni, Peiyuan and Zhang, Wenguang and Zhu, Xiaoxiao and Cao, Qixin},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={3619--3625},
  year={2020},
  organization={IEEE}
}

[ICRA2020] Real-Time, Highly Accurate Robotic Grasp Detection using Fully Convolutional Neural Network with Rotation Ensemble Module, [Paper].

Keywords: 2D grasp; RGB input; Cornell dataset.

@inproceedings{park2020real,
  title={Real-time, highly accurate robotic grasp detection using fully convolutional neural network with rotation ensemble module},
  author={Park, Dongwon and Seo, Yonghyeok and Chun, Se Young},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={9397--9403},
  year={2020},
  organization={IEEE}
}

[CoRL2020] S4G: Amodal Single-view Single-Shot SE(3) Grasp Detection in Cluttered Scenes, [Project], [Paper], [Code].

Keywords: 6D grasp; cluttered scene; single-view point cloud input; one-stage grasp prediction; train on synthetic data.

@inproceedings{qin2020s4g,
  title={S4g: Amodal Single-View Single-Shot SE(3) Grasp Detection in Cluttered Scenes},
  author={Qin, Yuzhe and Chen, Rui and Zhu, Hao and Song, Meng and Xu, Jing and Su, Hao},
  booktitle={Conference on Robot Learning},
  pages={53--65},
  year={2020},
  organization={PMLR}
}

[IROS2020] Antipodal Robotic Grasping using Generative Residual Convolutional Neural Network, [Paper], [Code].

Keywords: 2D grasp; cluttered scene; input RGB/D/RGB-D; Cornell dataset and Jacquard dataset.

@INPROCEEDINGS{9340777,
  author={Kumra, Sulabh and Joshi, Shirin and Sahin, Ferat},
  booktitle={2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 
  title={Antipodal Robotic Grasping using Generative Residual Convolutional Neural Network}, 
  year={2020},
  pages={9626-9633},
  doi={10.1109/IROS45743.2020.9340777}
}

[ICRA2020] Using Synthetic Data and Deep Networks to Recognize Primitive Shapes for Object Grasping, [Paper], [Code].

Keywords: depth input; segment the object into primitive shape classes and transform the predefined grasps of each primitive shape class onto the object.

@INPROCEEDINGS{9197256,
  author={Lin, Yunzhi and Tang, Chao and Chu, Fu-Jen and Vela, Patricio A.},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={Using Synthetic Data and Deep Networks to Recognize Primitive Shapes for Object Grasping}, 
  year={2020},
  pages={10494-10501},
  doi={10.1109/ICRA40945.2020.9197256}
}
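
The transfer step of this primitive-shape pipeline amounts to mapping grasps stored once per primitive class into the camera frame with the estimated pose of the fitted primitive. A small numpy sketch of that step with illustrative canonical grasps and an assumed pose:

import numpy as np

def transfer_grasps(T_cam_obj, canonical_grasps):
    """Map grasps defined in the primitive/object frame into the camera frame."""
    return [T_cam_obj @ g for g in canonical_grasps]

def make_transform(R, t):
    T = np.eye(4); T[:3, :3] = R; T[:3, 3] = t
    return T

# e.g. two canonical side grasps stored for a "cylinder" primitive (illustrative values)
canonical = [make_transform(np.eye(3), [0.05, 0.0, 0.0]),
             make_transform(np.eye(3), [-0.05, 0.0, 0.0])]

# assumed estimated pose of the fitted primitive in the camera frame
T_cam_obj = make_transform(np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]]), [0.1, 0.0, 0.6])

for T_grasp in transfer_grasps(T_cam_obj, canonical):
    print(np.round(T_grasp[:3, 3], 3))      # grasp positions in the camera frame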

[ICRA2020] Action Image Representation: Learning Scalable Deep Grasping Policies with Zero Real World Data, [Paper].

Keywords: 2D grasp; sampling-based grasp generation; a new grasp proposal representation.

@INPROCEEDINGS{9197415,
  author={Khansari, Mohi and Kappler, Daniel and Luo, Jianlan and Bingham, Jeff and Kalakrishnan, Mrinal},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={Action Image Representation: Learning Scalable Deep Grasping Policies with Zero Real World Data}, 
  year={2020},
  pages={3597-3603},
  doi={10.1109/ICRA40945.2020.9197415}
}

[IJRR2020] Learning robust, real-time, reactive robotic grasping, [Paper], [Code].

Keywords: GG-CNN; depth image input.

@article{doi:10.1177/0278364919859066,
  author = {Douglas Morrison and Peter Corke and Jürgen Leitner},
  title ={Learning robust, real-time, reactive robotic grasping},
  journal = {The International Journal of Robotics Research},
  volume = {39},
  number = {2-3},
  pages = {183-201},
  year = {2020},
  doi = {10.1177/0278364919859066},
}

[ICCV2019] 6-DOF GraspNet: Variational Grasp Generation for Object Manipulation, [Paper], [Code].

Keywords: 6D grasp; sampling then evaluation; PointNet++ backbone; generate dataset in simulation; single object point cloud and grasp input.

@inproceedings{mousavian20196,
  title={6-dof graspnet: Variational grasp generation for object manipulation},
  author={Mousavian, Arsalan and Eppner, Clemens and Fox, Dieter},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={2901--2910},
  year={2019}
}
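
The sampling-then-evaluation pattern shared by this line of work can be summarized in a few lines; the sampler and evaluator below are placeholders standing in for learned networks (e.g. a variational grasp sampler and a PointNet++-based scorer), not the actual 6-DOF GraspNet code:

import numpy as np

def sample_then_evaluate(object_points, sampler, evaluator, num_samples=200):
    """Generic sampling-then-evaluation grasp pipeline (sketch).

    object_points: (N, 3) partial point cloud of the target object.
    sampler:       callable(points, k) -> (k, 4, 4) candidate 6-DoF grasp poses.
    evaluator:     callable(points, grasps) -> (k,) predicted success scores.
    """
    grasps = sampler(object_points, num_samples)   # propose 6-DoF candidates
    scores = np.asarray(evaluator(object_points, grasps))  # score each candidate
    # Some methods additionally refine candidates by gradient ascent on the
    # evaluator's score; that step is omitted here for brevity.
    order = np.argsort(-scores)
    return grasps[order], scores[order]

Used this way, the executed grasp is simply the highest-scoring candidate that is also kinematically reachable and collision-free for the robot at hand.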

[ICRA2019] PointNetGPD: Detecting Grasp Configurations from Point Set, [Paper], [Code].

Keywords: 6D grasp; sampling then evaluation; single-object point cloud and grasp input; PointNet backbone; generates a large grasp dataset with the YCB object set.

@inproceedings{liang2019pointnetgpd,
  title={Pointnetgpd: Detecting grasp configurations from point sets},
  author={Liang, Hongzhuo and Ma, Xiaojian and Li, Shuang and G{\"o}rner, Michael and Tang, Song and Fang, Bin and Sun, Fuchun and Zhang, Jianwei},
  booktitle={2019 International Conference on Robotics and Automation (ICRA)},
  pages={3629--3635},
  year={2019},
  organization={IEEE}
}

[RSS2018] Closing the Loop for Robotic Grasping: A Real-time, Generative Grasp Synthesis Approach, [Paper], [Code].

@inproceedings{morrison2018closing,
	title={{Closing the Loop for Robotic Grasping: A Real-time, Generative Grasp Synthesis Approach}},
	author={Morrison, Douglas and Corke, Peter and Leitner, J\"urgen},
	booktitle={Proc.\ of Robotics: Science and Systems (RSS)},
	year={2018}
}

[RSS2017] Dex-Net 2.0: Deep Learning to Plan Robust Grasps with Synthetic Point Clouds and Analytic Grasp Metrics, [Paper], [Code].

@inproceedings{mahler2017dex,
  title={Dex-Net 2.0: Deep Learning to Plan Robust Grasps with Synthetic Point Clouds and Analytic Grasp Metrics},
  author={Mahler, Jeffrey and Liang, Jacky and Niyaz, Sherdil and Laskey, Michael and Doan, Richard and Liu, Xinyu and Aparicio, Juan and Goldberg, Ken},
  booktitle={Robotics: Science and Systems (RSS)},
  volume={13},
  year={2017}
}

[IJRR2017] Grasp Pose Detection in Point Clouds, [Paper], [Code].

Keywords: 6D grasp; point cloud input; CNN based method.

@article{ten2017grasp,
  title={Grasp pose detection in point clouds},
  author={ten Pas, Andreas and Gualtieri, Marcus and Saenko, Kate and Platt, Robert},
  journal={The International Journal of Robotics Research},
  volume={36},
  number={13-14},
  pages={1455--1473},
  year={2017},
  publisher={SAGE Publications Sage UK: London, England}
}

[IJRR2015] Deep Learning for Detecting Robotic Grasps, [Paper].

Keywords: 2D grasp; cluttered scene.

@article{lenz2015deep,
  title={Deep learning for detecting robotic grasps},
  author={Lenz, Ian and Lee, Honglak and Saxena, Ashutosh},
  journal={The International Journal of Robotics Research},
  volume={34},
  number={4-5},
  pages={705--724},
  year={2015},
  publisher={SAGE Publications Sage UK: London, England}
}

[ICRA2011] Efficient grasping from RGBD images: Learning using a new rectangle representation, [Paper].

@INPROCEEDINGS{5980145,
  author={Yun Jiang and Moseson, Stephen and Saxena, Ashutosh},
  booktitle={2011 IEEE International Conference on Robotics and Automation}, 
  title={Efficient grasping from RGBD images: Learning using a new rectangle representation}, 
  year={2011},
  volume={},
  number={},
  pages={3304-3311},
  doi={10.1109/ICRA.2011.5980145}
}
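
The oriented grasp rectangle introduced by this work is commonly parameterized by an image-space center, an orientation, and the gripper opening; the small sketch below illustrates that representation (the field names and corner convention are illustrative assumptions):

import numpy as np
from dataclasses import dataclass

@dataclass
class GraspRectangle:
    """Planar grasp represented as an oriented rectangle in image coordinates."""
    cx: float      # center column (pixels)
    cy: float      # center row (pixels)
    theta: float   # gripper orientation (radians)
    width: float   # gripper opening, along the grasp axis
    height: float  # jaw extent, perpendicular to the grasp axis

    def corners(self):
        """Return the four rectangle corners as a (4, 2) array of (x, y)."""
        c, s = np.cos(self.theta), np.sin(self.theta)
        half = np.array([[-self.width, -self.height],
                         [ self.width, -self.height],
                         [ self.width,  self.height],
                         [-self.width,  self.height]]) / 2.0
        rot = np.array([[c, -s], [s, c]])
        return half @ rot.T + np.array([self.cx, self.cy])

The Cornell and Jacquard datasets referenced by several entries above annotate grasps in this rectangle form.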

3.2 Dexterous grasping

[arXiv2023] Generalized Anthropomorphic Functional Grasping with Minimal Demonstrations, [Paper].

Keywords: dexterous grasping; functional grasp; learns from human grasp demonstrations for category-level objects; object reconstruction -> variational grasp sampler -> iterative grasp refinement; a dataset of 10k synthesized functional grasps.

@misc{wei2023generalized,
    title={Generalized Anthropomorphic Functional Grasping with Minimal Demonstrations},
    author={Wei Wei and Peng Wang and Sizhe Wang},
    year={2023},
    eprint={2303.17808},
    archivePrefix={arXiv},
    primaryClass={cs.RO}
}

[ICRA2023] DexGraspNet: A Large-Scale Robotic Dexterous Grasp Dataset for General Objects Based on Simulation, [Project], [Paper], [Code], [Dataset].

Keywords: dexterous grasping, ShadowHand; release a large-scale dexterous grasping dataset in simulation, 5355 objects, 133 categories, 1.32M grasps.

Motivation: Dexterous grasping is much more under-explored than parallel grasping, partially due to the lack of a large-scale dataset. To accelerate the study of dexterous object manipulation, the authors propose a large-scale simulated grasping dataset.

@article{wang2022dexgraspnet,
  title={DexGraspNet: A Large-Scale Robotic Dexterous Grasp Dataset for General Objects Based on Simulation},
  author={Wang, Ruicheng and Zhang, Jialiang and Chen, Jiayi and Xu, Yinzhen and Li, Puhao and Liu, Tengyu and Wang, He},
  journal={arXiv preprint arXiv:2210.02697},
  year={2022}
}

[ICRA2023] GenDexGrasp: Generalizable Dexterous Grasping, [Project], [Paper], [Code].

Keywords: 6D grasp; single object; multi-hand grasp; first generate a hand-agnostic contact map for the given object, then optimize the hand pose to match the generated contact map; proposes a synthetic large-scale multi-hand grasping dataset.

Motivation: Existing methods mostly focus on a specific type of robot hand, and oftentimes fail to rapidly generate diverse grasps with a high success rate. This paper leverages the contact map as a hand-agnostic intermediate representation and transfers among diverse multi-fingered robotic hands.

@article{li2022gendexgrasp,
  title={GenDexGrasp: Generalizable Dexterous Grasping},
  author={Li, Puhao and Liu, Tengyu and Li, Yuyang and Zhu, Yixin and Yang, Yaodong and Huang, Siyuan},
  journal={arXiv preprint arXiv:2210.00722},
  year={2022}
}
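
A heavily simplified sketch of the contact-map matching idea described above: given a per-point contact map on the object, the hand pose is optimized so that its induced contact pattern matches it. The differentiable hand model (hand_fk), the soft-contact temperature, and the loss are placeholders, not the paper's implementation:

import torch

def fit_hand_to_contact_map(object_points, target_contact, hand_fk,
                            init_params, steps=200, lr=1e-2, temp=0.01):
    """Optimize hand parameters so the induced contact map matches a target.

    object_points:  (N, 3) object surface points.
    target_contact: (N,) desired per-point contact values in [0, 1].
    hand_fk:        callable(params) -> (M, 3) hand surface points
                    (stand-in for a differentiable hand model).
    init_params:    initial wrist pose / joint parameters as a 1-D tensor.
    """
    params = init_params.clone().requires_grad_(True)
    opt = torch.optim.Adam([params], lr=lr)
    for _ in range(steps):
        hand_points = hand_fk(params)                                      # (M, 3)
        # Distance from each object point to its closest hand point.
        dist = torch.cdist(object_points, hand_points).min(dim=1).values  # (N,)
        # Soft contact indicator: near-zero distance maps to a value near 1.
        contact = torch.exp(-dist / temp)
        loss = torch.nn.functional.mse_loss(contact, target_contact)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return params.detach()

Because the contact map is defined on the object rather than on a specific hand, the same target can in principle be matched by different grippers, which is the point of the hand-agnostic intermediate representation.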

[ICRA2022] HGC-Net: Deep Anthropomorphic Hand Grasping in Clutter, [Paper], [Code].

Keywords: 6D grasp; cluttered scenes; dexterous grasping; single-view point cloud input; trained on a synthetic dataset.

@INPROCEEDINGS{9811756,
  author={Li, Yiming and Wei, Wei and Li, Daheng and Wang, Peng and Li, Wanyi and Zhong, Jun},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={HGC-Net: Deep Anthropomorphic Hand Grasping in Clutter}, 
  year={2022},
  volume={},
  number={},
  pages={714-720},
  doi={10.1109/ICRA46639.2022.9811756}
}

3.3 Semantic grasping

[CoRL2023] Language-guided Robot Grasping: CLIP-based Referring Grasp Synthesis in Clutter, [Paper].

Keywords: 2D grasp; RGB-Text input; develop a benchmark called OCID-VLG based on cluttered indoor scenes from OCID dataset; propose an end-to-end model to learn grasp synthesis directly from image-text pairs.

@inproceedings{tziafas2023language,
  title={Language-guided Robot Grasping: CLIP-based Referring Grasp Synthesis in Clutter},
  author={Tziafas, Georgios and Yucheng, XU and Goel, Arushi and Kasaei, Mohammadreza and Li, Zhibin and Kasaei, Hamidreza},
  booktitle={7th Annual Conference on Robot Learning},
  year={2023}
}

[ICRA2023] A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter, [Paper].

Keywords: language-guided task-oriented grasping; 6D grasp; cluttered scene; object-centric representation, a joint modeling of vision, language, and grasps through a cross-attention module; incorporates model-free reinforcement learning for obstacle removal and target-object grasping; utilizes priors from a pre-trained CLIP and grasp model to improve sample efficiency and alleviate the sim2real problem.

@article{xu2023joint,
  title={A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter},
  author={Xu, Kechun and Zhao, Shuqi and Zhou, Zhongxiang and Li, Zizhang and Pi, Huaijin and Zhu, Yifeng and Wang, Yue and Xiong, Rong},
  journal={arXiv preprint arXiv:2302.12610},
  year={2023}
}

[arXiv2023] Learning 6-DoF Fine-grained Grasp Detection Based on Part Affordance Grounding, [Project], [Paper].

Keywords: single object; 6D grasp; fine-grained, task-oriented, language-guided grasp; proposes a large language-guided shape grasping dataset with 16.6k objects of 16 categories in a simulation environment; part affordance grounding and grasp stability evaluation; sampling-then-evaluation method.

@article{song2023learning,
  title={Learning 6-DoF Fine-grained Grasp Detection Based on Part Affordance Grounding},
  author={Song, Yaoxian and Sun, Penglei and Ren, Yi and Zheng, Yu and Zhang, Yue},
  journal={arXiv preprint arXiv:2301.11564},
  year={2023}
}

[IROS2023] Task-Oriented Grasp Prediction with Visual-Language Inputs, [Paper].

Keywords: 2D grasp; cluttered scene; image and language input; two-stage method, from object grounding to affordance grounding.

@article{tang2023task,
  title={Task-Oriented Grasp Prediction with Visual-Language Inputs},
  author={Tang, Chao and Huang, Dehao and Meng, Lingxiao and Liu, Weiyu and Zhang, Hong},
  journal={arXiv preprint arXiv:2302.14355},
  year={2023}
}

[IROS2022] Learning 6-DoF Task-oriented Grasp Detection via Implicit Estimation and Visual Affordance, [Paper].

Keywords: task-oriented grasping; single object; point cloud input; 6D grasp; a grasping affordance detection module generates grasps corresponding to the affordance label, an evaluation network recognizes success and failure, and a visual affordance network outputs an affordance map to refine grasp candidates.

@INPROCEEDINGS{9981900,
  author={Chen, Wenkai and Liang, Hongzhuo and Chen, Zhaopeng and Sun, Fuchun and Zhang, Jianwei},
  booktitle={2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 
  title={Learning 6-DoF Task-oriented Grasp Detection via Implicit Estimation and Visual Affordance}, 
  year={2022},
  volume={},
  number={},
  pages={762-769},
  doi={10.1109/IROS47612.2022.9981900}
}

[ICRA2023] CoGrasp: 6-DoF Grasp Generation for Human-Robot Collaboration, [Paper].

Keywords: 6D grasp; RGB-D input; instance segmentation yields a partial object point cloud, which is then completed via shape completion; robot grasps and human grasps are generated from the completed object point cloud; finally, a pruning network selects the robot grasps compatible with co-grasping.

@INPROCEEDINGS{10160623,
  author={Keshari, Abhinav K. and Ren, Hanwen and Qureshi, Ahmed H.},
  booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={CoGrasp: 6-DoF Grasp Generation for Human-Robot Collaboration}, 
  year={2023},
  volume={},
  number={},
  pages={9829-9836},
  doi={10.1109/ICRA48891.2023.10160623}
}

[RA-L2022] REGRAD: A Large-Scale Relational Grasp Dataset for Safe and Object-Specific Robotic Grasping in Clutter, [Paper], [Code&&Dataset].

Keywords: releases a dataset; cluttered scene; auto-generated in simulation; learns relationships among objects and grasps.

@ARTICLE{9681218,
  author={Zhang, Hanbo and Yang, Deyu and Wang, Han and Zhao, Binglei and Lan, Xuguang and Ding, Jishiyu and Zheng, Nanning},
  journal={IEEE Robotics and Automation Letters}, 
  title={REGRAD: A Large-Scale Relational Grasp Dataset for Safe and Object-Specific Robotic Grasping in Clutter}, 
  year={2022},
  volume={7},
  number={2},
  pages={2929-2936},
  doi={10.1109/LRA.2022.3142401}
}

[RA-L2022] Few-Shot Instance Grasping of Novel Objects in Clutter, [Paper], [Code].

Keywords: cluttered scene; grasp a specific object; meta-learning framework; 2D grasp.

@ARTICLE{9773996,
  author={Guo, Weikun and Li, Wei and Hu, Ziye and Gan, Zhongxue},
  journal={IEEE Robotics and Automation Letters}, 
  title={Few-Shot Instance Grasping of Novel Objects in Clutter}, 
  year={2022},
  volume={7},
  number={3},
  pages={6566-6573},
  doi={10.1109/LRA.2022.3174648}
}

[ICRA2022] Learning Object Relations with Graph Neural Networks for Target-Driven Grasping in Dense Clutter, [Project], [Paper].

Keywords: target-driven grasp; cluttered scene; 6D grasp; sampling-based grasp generation; shape-completion-assisted grasp sampling; formulates a grasp graph with nodes representing objects and edges indicating spatial relations between them; trained on a synthetic dataset; input scene RGB and query image.

@INPROCEEDINGS{9811601,
  author={Lou, Xibai and Yang, Yang and Choi, Changhyun},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Learning Object Relations with Graph Neural Networks for Target-Driven Grasping in Dense Clutter}, 
  year={2022},
  pages={742-748},
  doi={10.1109/ICRA46639.2022.9811601}
}
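
The grasp-graph formulation above (objects as nodes, spatial relations as edges) can be sketched with a simple proximity-based adjacency; the node encoder and the contact threshold below are illustrative stand-ins, since the paper learns these relations with a graph neural network rather than thresholding distances:

import numpy as np

def build_scene_graph(object_point_clouds, encode_fn, contact_thresh=0.02):
    """Build a naive scene graph for target-driven grasping (sketch).

    object_point_clouds: list of (N_i, 3) arrays, one per segmented object.
    encode_fn:           callable(points) -> fixed-size node feature vector.
    contact_thresh:      max point-to-point distance (meters) for an edge.
    """
    nodes = np.stack([encode_fn(pc) for pc in object_point_clouds])
    n = len(object_point_clouds)
    adj = np.zeros((n, n), dtype=np.float32)
    for i in range(n):
        for j in range(i + 1, n):
            # Connect two objects if their point clouds nearly touch -- a crude
            # proxy for "stacked on" / "leaning against" relations.
            diff = object_point_clouds[i][:, None, :] - object_point_clouds[j][None, :, :]
            if np.linalg.norm(diff, axis=-1).min() < contact_thresh:
                adj[i, j] = adj[j, i] = 1.0
    return nodes, adj  # node features and adjacency for a GNN to reason over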

[ICRA2022] Interactive Robotic Grasping with Attribute-Guided Disambiguation, [Project], [Paper].

Keywords: cluttered scene; input scene RGB-D and query language; 6D grasp; a vision-and-language grounding module predicts target scores and attribute scores; an attribute-guided partially observable Markov decision process handles language disambiguation (asking questions).

@INPROCEEDINGS{9812360,
  author={Yang, Yang and Lou, Xibai and Choi, Changhyun},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Interactive Robotic Grasping with Attribute-Guided Disambiguation}, 
  year={2022},
  pages={8914-8920},
  doi={10.1109/ICRA46639.2022.9812360}
}

[ICRA2022] I Know What You Draw: Learning Grasp Detection Conditioned on a Few Freehand Sketches, [Project], [Paper].

Keywords: 2D planar grasp; cluttered scene; target grasps by understanding freehand sketches; RGB image and graph-represented sketch input.

@INPROCEEDINGS{9812372,
  author={Lin, Haitao and Cheang, Chilam and Fu, Yanwei and Xue, Xiangyang},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={I Know What You Draw: Learning Grasp Detection Conditioned on a Few Freehand Sketches}, 
  year={2022},
  pages={8417-8423},
  doi={10.1109/ICRA46639.2022.9812372}
}

[ICRA2022] Learning 6-DoF Object Poses to Grasp Category-level Objects by Language Instructions, [Project], [Paper], [Code].

Keywords: grasp target object based on language description; two-stage method; 2D visual grounding, category-level object pose estimation; RGBD and language description input.

@INPROCEEDINGS{9811367,
  author={Cheang, Chilam and Lin, Haitao and Fu, Yanwei and Xue, Xiangyang},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={Learning 6-DoF Object Poses to Grasp Category-Level Objects by Language Instructions}, 
  year={2022},
  pages={8476-8482},
  doi={10.1109/ICRA46639.2022.9811367}
}

[ICRA2022] CaTGrasp: Learning Category-Level Task-Relevant Grasping in Clutter from Simulation, [Project], [Paper], [Code].

Keywords: 6D category-level task-oriented grasp; cluttered scene in simulation; self-supervised in simulation.

@INPROCEEDINGS{9811568,
  author={Wen, Bowen and Lian, Wenzhao and Bekris, Kostas and Schaal, Stefan},
  booktitle={2022 International Conference on Robotics and Automation (ICRA)}, 
  title={CaTGrasp: Learning Category-Level Task-Relevant Grasping in Clutter from Simulation}, 
  year={2022},
  pages={6401-6408},
  doi={10.1109/ICRA46639.2022.9811568}
}

[RA-L2022] GATER: Learning Grasp-Action-Target Embeddings and Relations for Task-Specific Grasping, [Paper].

Keywords: 2D planar grasp; task-oriented grasp; self-made task-oriented grasp dataset; grasp-action-target relationship.

@ARTICLE{9629256,
  author={Sun, Ming and Gao, Yue},
  journal={IEEE Robotics and Automation Letters}, 
  title={GATER: Learning Grasp-Action-Target Embeddings and Relations for Task-Specific Grasping}, 
  year={2022},
  volume={7},
  number={1},
  pages={618-625},
  doi={10.1109/LRA.2021.3131378}
}

[ICRA2021] A Joint Network for Grasp Detection Conditioned on Natural Language Commands, [Paper].

Keywords: 2D planar grasp; structured language command and RGB input; VMRD dataset; target-specific grasp.

@inproceedings{chen2021joint,
  title={A joint network for grasp detection conditioned on natural language commands},
  author={Chen, Yiye and Xu, Ruinian and Lin, Yunzhi and Vela, Patricio A},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={4576--4582},
  year={2021},
  organization={IEEE}
}

[ICRA2021] End-to-end Trainable Deep Neural Network for Robotic Grasp Detection and Semantic Segmentation from RGB, [Paper].

Keywords: 2D grasp; joint grasp detection and semantic segmentation; OCID dataset.

@INPROCEEDINGS{9561398,
  author={Ainetter, Stefan and Fraundorfer, Friedrich},
  booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={End-to-end Trainable Deep Neural Network for Robotic Grasp Detection and Semantic Segmentation from RGB}, 
  year={2021},
  pages={13452-13458},
  doi={10.1109/ICRA48506.2021.9561398}
}

[RSS2021] INVIGORATE: Interactive Visual Grounding and Grasping in Clutter, [Paper].

Keywords: input language expressions and RGB; cluttered scene; trains separate neural networks for object detection, visual grounding, question generation, and object blocking-relationship detection and grasping.

@INPROCEEDINGS{ZhangLu-RSS-21, 
    AUTHOR    = {Hanbo Zhang AND Yunfan Lu AND Cunjun Yu AND David Hsu AND Xuguang Lan AND Nanning Zheng}, 
    TITLE     = {{INVIGORATE: Interactive Visual Grounding and Grasping in Clutter}}, 
    BOOKTITLE = {Proceedings of Robotics: Science and Systems}, 
    YEAR      = {2021}, 
    ADDRESS   = {Virtual}, 
    MONTH     = {July}, 
    DOI       = {10.15607/RSS.2021.XVII.020} 
} 

[ICRA2020] 6-DOF Grasping for Target-driven Object Manipulation in Clutter, [Paper].

Keywords: 6D grasp; cluttered scene; grasp target object; RGB-D input; sampling based grasp.

@INPROCEEDINGS{9197318,
  author={Murali, Adithyavairavan and Mousavian, Arsalan and Eppner, Clemens and Paxton, Chris and Fox, Dieter},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={6-DOF Grasping for Target-driven Object Manipulation in Clutter}, 
  year={2020},
  pages={6232-6238},
  doi={10.1109/ICRA40945.2020.9197318}
}

[CoRL2020] Same Object, Different Grasps: Data and Semantic Knowledge for Task-Oriented Grasping, [Project], [Paper], [Code], [Dataset].

Keywords: 6D task-oriented grasp; single object; real-world data; object point cloud and goal task input; PointNet++ backbone for point cloud, Graph Convolutional Network for object and task semantic knowledge.

@inproceedings{murali2020taskgrasp,
  title={Same Object, Different Grasps: Data and Semantic Knowledge for Task-Oriented Grasping},
  author={Murali, Adithyavairavan and Liu, Weiyu and Marino, Kenneth and Chernova, Sonia and Gupta, Abhinav},
  booktitle={Conference on Robot Learning},
  year={2020}
}

[IJRR2020] Learning Task-Oriented Grasping for Tool Manipulation from Simulated Self-Supervision, [Project], [Paper].

Keywords: task-oriented grasping; manipulation policy; self-supervised in simulation; single object; planar grasp; depth image input; 2 tasks.

@article{fang2020learning,
  title={Learning task-oriented grasping for tool manipulation from simulated self-supervision},
  author={Fang, Kuan and Zhu, Yuke and Garg, Animesh and Kurenkov, Andrey and Mehta, Viraj and Fei-Fei, Li and Savarese, Silvio},
  journal={The International Journal of Robotics Research},
  volume={39},
  number={2-3},
  pages={202--216},
  year={2020},
  publisher={SAGE Publications Sage UK: London, England}
}

[RSS2020] Robot Object Retrieval with Contextual Natural Language Queries, [Paper], [Code].

Keywords: retrieve objects based on their usage; localize the target object in RGB, then grasp it.

@INPROCEEDINGS{Nguyen-RSS-20,
    AUTHOR    = {Thao Nguyen AND Nakul Gopalan AND Roma Patel AND Matthew Corsaro AND Ellie Pavlick AND Stefanie Tellex},
    TITLE     = {{Robot Object Retrieval with Contextual Natural Language Queries}},
    BOOKTITLE = {Proceedings of Robotics: Science and Systems},
    YEAR      = {2020},
    ADDRESS   = {Corvalis, Oregon, USA},
    MONTH     = {July},
    DOI       = {10.15607/RSS.2020.XVI.080}
}

[ICRA2020] CAGE: Context-Aware Grasping Engine, [Paper], [Code].

Keywords: single object; semantic context including task, object state, object material, and object affordance; semantic context and sampled grasps as input; outputs a ranking of grasps ordered by their suitability to the context.

@inproceedings{liu2020cage,
  title={Cage: Context-aware grasping engine},
  author={Liu, Weiyu and Daruna, Angel and Chernova, Sonia},
  booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={2550--2556},
  year={2020},
  organization={IEEE}
}

[IROS2019] Task-oriented Grasping in Object Stacking Scenes with CRF-based Semantic Model, [Paper].

Keywords: task-oriented grasping; cluttered scene in simulation; planar grasp; 11 tasks, 10 object categories, 100 objects; depth image input.

@inproceedings{yang2019task,
  title={Task-oriented grasping in object stacking scenes with crf-based semantic model},
  author={Yang, Chenjie and Lan, Xuguang and Zhang, Hanbo and Zheng, Nanning},
  booktitle={2019 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  pages={6427--6434},
  year={2019},
  organization={IEEE}
}

[ICRA2018] Interactively Picking Real-World Objects with Unconstrained Spoken Language Instructions, [Paper].

Keywords: language-guided robotic grasping; resolve instruction ambiguity through dialogue; localize object by detection first, then identify the target object; vacuum gripper.

@INPROCEEDINGS{8460699,
  author={Hatori, Jun and Kikuchi, Yuta and Kobayashi, Sosuke and Takahashi, Kuniyuki and Tsuboi, Yuta and Unno, Yuya and Ko, Wilson and Tan, Jethro},
  booktitle={2018 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={Interactively Picking Real-World Objects with Unconstrained Spoken Language Instructions}, 
  year={2018},
  pages={3774-3781},
  doi={10.1109/ICRA.2018.8460699}
}

[ICRA2018] AffordanceNet: An End-to-End Deep Learning Approach for Object Affordance Detection, [Paper], [Code].

Keywords: simultaneous object detection and affordance detection; RGB input.

@inproceedings{do2018affordancenet,
  title={Affordancenet: An end-to-end deep learning approach for object affordance detection},
  author={Do, Thanh-Toan and Nguyen, Anh and Reid, Ian},
  booktitle={2018 IEEE international conference on robotics and automation (ICRA)},
  pages={5882--5889},
  year={2018},
  organization={IEEE}
}

[Humanoids2017] Affordance Detection for Task-Specific Grasping Using Deep Learning, [Paper].

Keywords: single object point cloud and task name input; outputs affordance detection, not grasps; 5 tasks, 10 object classes; generalizes to novel object classes.

@inproceedings{kokic2017affordance,
  title={Affordance detection for task-specific grasping using deep learning},
  author={Kokic, Mia and Stork, Johannes A and Haustein, Joshua A and Kragic, Danica},
  booktitle={2017 IEEE-RAS 17th International Conference on Humanoid Robotics (Humanoids)},
  pages={91--98},
  year={2017},
  organization={IEEE}
}

3.4 Dynamic Grasping

[IROS2023] Flexible Handover with Real-Time Robust Dynamic Grasp Trajectory Generation, [Paper].

Keywords: flexible human-to-robot handover; generate object grasp trajectory based on grasp detection method GSNet and a lightweight transformer; future grasp prediction algorithm.

@INPROCEEDINGS{10341777,
  author={Zhang, Gu and Fang, Hao-Shu and Fang, Hongjie and Lu, Cewu},
  booktitle={2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, 
  title={Flexible Handover with Real-Time Robust Dynamic Grasp Trajectory Generation}, 
  year={2023},
  volume={},
  number={},
  pages={3192-3199},
  doi={10.1109/IROS55552.2023.10341777}
}

[CVPR2023] Target-Referenced Reactive Grasping for Dynamic Objects, [Project], [Paper], [Code].

Keywords: reactive grasping, grasping dynamically moving objects; 6D grasp; cluttered scenes; given the grasps of the first frame, tracking through the generated grasp space; two-stage method: first discover grasp correspondences between frames, then refine them based on history information.

Motivation: Current methods mainly focus on temporal smoothness, but few consider semantic consistency, so they cannot guarantee that the tracked grasps fall on the same part of the same object. This paper proposes a target-referenced setting to achieve temporally smooth and semantically consistent reactive grasping in clutter, given a targeted grasp.

@InProceedings{Liu_2023_CVPR,
    author    = {Liu, Jirong and Zhang, Ruo and Fang, Hao-Shu and Gou, Minghao and Fang, Hongjie and Wang, Chenxi and Xu, Sheng and Yan, Hengxu and Lu, Cewu},
    title     = {Target-Referenced Reactive Grasping for Dynamic Objects},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2023},
    pages     = {8824-8833}
}

[T-RO2023] AnyGrasp: Robust and Efficient Grasp Perception in Spatial and Temporal Domains, [Project], [Paper], [Demo_Code].

Keywords: 6D grasp; dynamic grasping.

@ARTICLE{10167687,
  author={Fang, Hao-Shu and Wang, Chenxi and Fang, Hongjie and Gou, Minghao and Liu, Jirong and Yan, Hengxu and Liu, Wenhai and Xie, Yichen and Lu, Cewu},
  journal={IEEE Transactions on Robotics}, 
  title={AnyGrasp: Robust and Efficient Grasp Perception in Spatial and Temporal Domains}, 
  year={2023},
  volume={39},
  number={5},
  pages={3929-3945},
  doi={10.1109/TRO.2023.3281153}
}

4. Research Groups