% IETF Internet-Draft (work in progress), revision 02 of
% draft-yao-tsvwg-cco-requirement-and-analysis, dated 8 July 2024.
% The title is single-braced so the bibliography style controls its casing;
% it contains no acronyms or proper nouns that need brace protection.
% NOTE(review): "Xu Shiping" and "Liu Chang" may be written family-name-first;
% confirm against the draft and, if so, switch them to "Last, First" form.
@techreport{yao-tsvwg-cco-requirement-and-analysis-02,
  author      = {Kehan Yao and Xu Shiping and Liu Chang and Yizhou Li and Hongyi Huang and Weifeng Wang and Dirk Kutscher},
  title       = {Collective Communication Optimizations: Requirement and Analysis},
  type        = {Internet-Draft},
  number      = {draft-yao-tsvwg-cco-requirement-and-analysis-02},
  institution = {Internet Engineering Task Force},
  publisher   = {Internet Engineering Task Force},
  year        = 2024,
  month       = jul,
  day         = 8,
  pagetotal   = 15,
  url         = {https://datatracker.ietf.org/doc/draft-yao-tsvwg-cco-requirement-and-analysis/02/},
  note        = {Work in Progress},
  abstract    = {Generative AI applications depend on large scale parallel
                 computing clusters for model training and inference. Existing
                 implementations of collective communication in parallel
                 computing is built on top of RDMA, the most adoptable AI
                 transport protocol. However, One-to-Many, Many-to-One, and
                 Many-to-Many collective operations all depend on
                 point-to-point transport semantics of RDMA, which inevitably
                 introduces more bandwidth occupancy and transmission overhead.
                 Emerging approaches for collective communication optimization
                 focus on network-assisted collective acceleration and can work
                 compatibly with RDMA. This document analyzes different
                 technical schemes for network-assisted collective acceleration
                 based on RDMA, and presents the gap between these work and
                 current IETF standards, notably iWARP. Requirements for
                 designing new standards are proposed accordingly.},
}