% IETF Internet-Draft draft-xu-lsr-fare, revision 03 (work in progress).
@techreport{xu-lsr-fare-03,
  author      = {Xu, Xiaohu and Hegde, Shraddha and He, Zongying and Wang, Junjie and Huang, Hongyi and Zhang, Qingliang and Wu, Hang and Liu, Yadong and Xia, Yinben and Wang, Peilong},
  title       = {Fully Adaptive Routing {Ethernet} using {LSR}},
  type        = {Internet-Draft},
  number      = {draft-xu-lsr-fare-03},
  institution = {Internet Engineering Task Force},
  publisher   = {Internet Engineering Task Force},
  year        = 2024,
  month       = sep,
  day         = 1,
  pagetotal   = 10,
  note        = {Work in Progress},
  url         = {https://datatracker.ietf.org/doc/draft-xu-lsr-fare/03/},
  abstract    = {Large language models (LLMs) like ChatGPT have become increasingly popular in recent years due to their impressive performance in various natural language processing tasks. These models are built by training deep neural networks on massive amounts of text data, often consisting of billions or even trillions of parameters. However, the training process for these models can be extremely resource-intensive, requiring the deployment of thousands or even tens of thousands of GPUs in a single AI training cluster. Therefore, three-stage or even five-stage CLOS networks are commonly adopted for AI networks. The non-blocking nature of the network become increasingly critical for large-scale AI models. Therefore, adaptive routing is necessary to dynamically distribute traffic to the same destination over multiple equal-cost paths, based on network capacity and even congestion information along those paths.},
}