diff --git a/end-to-end-use-cases/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb b/end-to-end-use-cases/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb index 2cf5d38d3..060852573 100644 --- a/end-to-end-use-cases/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb +++ b/end-to-end-use-cases/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb @@ -20,7 +20,7 @@ "- Prompt `Llama-3.2-1B-Instruct` model to process it into a text file\n", "- Re-write this into a podcast transcript in next notebook.\n", "\n", - "In this notebook, we will upload a PDF and save it into a `.txt` file using the `PyPDF2` library, later we will process chunks from the text file using our featherlight model." + "In this notebook, we will upload a PDF and save it into a `.txt` file using the `pypdf` library, later we will process chunks from the text file using our featherlight model." ] }, { @@ -38,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "#!pip install PyPDF2\n", + "#!pip install pypdf\n", "#!pip install rich ipywidgets" ] }, @@ -70,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "import PyPDF2\n", + "import pypdf\n", "from typing import Optional\n", "import os\n", "import torch\n", @@ -132,7 +132,7 @@ " try:\n", " with open(file_path, 'rb') as file:\n", " # Create PDF reader object\n", - " pdf_reader = PyPDF2.PdfReader(file)\n", + " pdf_reader = pypdf.PdfReader(file)\n", " \n", " # Get total number of pages\n", " num_pages = len(pdf_reader.pages)\n", @@ -163,7 +163,7 @@ " print(f\"\\nExtraction complete! 
Total characters: {len(final_text)}\")\n", " return final_text\n", " \n", - " except PyPDF2.PdfReadError:\n", + " except pypdf.errors.PdfReadError:\n", " print(\"Error: Invalid or corrupted PDF file\")\n", " return None\n", " except Exception as e:\n", @@ -193,7 +193,7 @@ " \n", " try:\n", " with open(file_path, 'rb') as file:\n", - " pdf_reader = PyPDF2.PdfReader(file)\n", + " pdf_reader = pypdf.PdfReader(file)\n", " metadata = {\n", " 'num_pages': len(pdf_reader.pages),\n", " 'metadata': pdf_reader.metadata\n", @@ -271,7 +271,7 @@ "4University of Technology Sydney5Peking University6The University of Sydney\n", "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n", "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n", - "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati\n", + "Abstract \u2014In the era of Large Language Models (LLMs), Knowledge Distillati\n", "--------------------------------------------------\n", "\n", "Total characters extracted: 100016\n", @@ -549,14 +549,14 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "1 A Survey on Knowledge Distillation of Large Language Models Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1, Can Xu5, Dacheng Tao6, Tianyi Zhou2 1The University of Hong Kong2University of Maryland3Microsoft 4University of Technology Sydney5Peking University6The University of Sydney {shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu ckcheng@cs.hku.hk jl0725@connect.hku.hk Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati...\n", + "1 A Survey on Knowledge Distillation of Large Language Models Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1, Can Xu5, Dacheng Tao6, Tianyi Zhou2 1The University of Hong Kong2University of Maryland3Microsoft 4University of Technology Sydney5Peking University6The University of Sydney {shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu ckcheng@cs.hku.hk 
jl0725@connect.hku.hk Abstract \u2014In the era of Large Language Models (LLMs), Knowledge Distillati...\n", "\n", "PROCESSED TEXT:\n", "===============\n", "\n", "Knowledge Distillation is a methodology that transfers advanced capabilities from leading proprietary Large Language Models (LLMs) to their open-source counterparts, such as LLaMA and Mistral. This paper presents a comprehensive survey of KD's role in imparting advanced knowledge.\n", "\n", - "Abstract —In the era of Large Language Models, Knowledge Distillation emerges as a pivotal methodology for transferring advanced capabilities from proprietary LLMs to open-source counterparts, facilit...\n", + "Abstract \u2014In the era of Large Language Models, Knowledge Distillation emerges as a pivotal methodology for transferring advanced capabilities from proprietary LLMs to open-source counterparts, facilit...\n", "==========================================================================================\n", "\n" ] @@ -573,7 +573,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "advanced knowledge to smaller models and its utility in model compression and self- improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization – providing a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how DA emerges as a p...\n", + "advanced knowledge to smaller models and its utility in model compression and self- improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization \u2013 providing a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. 
Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how DA emerges as a p...\n", "\n", "PROCESSED TEXT:\n", "xamined through a meticulous survey that delves into the foundational pillars of algorithm, skill, and verticalization, which form the backbone of knowledge distillation and deep learning models. The survey provides a comprehensive examination of key mechanisms within the knowledge distillation framework, specifically focusing on the enhancement of cognitive abilities and their practical implications across various fields, with a particular emphasis on the interplay between data augmentation (DA...\n", @@ -593,7 +593,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "distillation and proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs. Index Terms —...\n", + "distillation and proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs. 
Index Terms \u2014...\n", "\n", "PROCESSED TEXT:\n", "en-source LLMs, this survey highlights the potential for more accessible, efficient, and powerful AI solutions.\n", @@ -811,7 +811,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "SupervisedFine-tuningX,Y preferenceRankOptimizationy,1y,2y3y1y2y3≻≻rank…… DataCuration X,YrawdatasynthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,YdemonstrationsexpandFeature featureinput,outputextractSec.4Sec.5 Sec.3.1Sec.3.2 Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated as ‘Sec.’ in this figure. RM S(·)denotes the student reward model. the growing demand for more accessib...\n", + "SupervisedFine-tuningX,Y preferenceRankOptimizationy,1y,2y3y1y2y3\u227b\u227brank\u2026\u2026 DataCuration X,YrawdatasynthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,YdemonstrationsexpandFeature featureinput,outputextractSec.4Sec.5 Sec.3.1Sec.3.2 Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that \u2018Section\u2019 is abbreviated as \u2018Sec.\u2019 in this figure. RM S(\u00b7)denotes the student reward model. the growing demand for more accessib...\n", "\n", "PROCESSED TEXT:\n", "synthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,Y demonstrationsexpandFeature featureinput,outputextractSec.4Sec.5 Sec.3.1Sec.3.2 Fig. 2: An overview of this survey on knowledge distillation of large language models...\n", @@ -831,7 +831,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "gaps in current techniques and proposing direc- tions for future research. Survey Organization. The remainder of this survey is orga- nized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm ofLLMs. 
Following this intro- duction, §2 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augment...\n", + "gaps in current techniques and proposing direc- tions for future research. Survey Organization. The remainder of this survey is orga- nized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm ofLLMs. Following this intro- duction, \u00a72 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augment...\n", "\n", "PROCESSED TEXT:\n", "es emerging, but there is still much to be learned from the era of Large Language Models (LLMs). In this section, we provide a foundational overview of knowledge distillation, highlighting the role of data augmentation (DA) in this context.\n", @@ -853,10 +853,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "includes discus- sions on natural language understanding (NLU), genera- tion (NLG), information retrieval, recommendation systems, and the evaluation of text generation. In §5, we ventureinto domain-specific vertical distillation, showcasing how knowledge distillation techniques are applied within spe- cialized fields such as law, healthcare, finance, and science, illustrating the practical implications and transformative impact of these approaches. The survey suggests open problems in §6, ident...\n", + "includes discus- sions on natural language understanding (NLU), genera- tion (NLG), information retrieval, recommendation systems, and the evaluation of text generation. 
In \u00a75, we ventureinto domain-specific vertical distillation, showcasing how knowledge distillation techniques are applied within spe- cialized fields such as law, healthcare, finance, and science, illustrating the practical implications and transformative impact of these approaches. The survey suggests open problems in \u00a76, ident...\n", "\n", "PROCESSED TEXT:\n", - "mmendation systems, and the evaluation of text generation. In §5, we delve into domain-specific vertical distillation, demonstrating how knowledge distillation techniques are applied in specialized fields such as law, healthcare, finance, and science, highlighting their practical implications and transformative impact. The survey reveals open problems in §6, highlighting current challenges and gaps in knowledge distillation research that present opportunities for future work....\n", + "mmendation systems, and the evaluation of text generation. In \u00a75, we delve into domain-specific vertical distillation, demonstrating how knowledge distillation techniques are applied in specialized fields such as law, healthcare, finance, and science, highlighting their practical implications and transformative impact. 
The survey reveals open problems in \u00a76, highlighting current challenges and gaps in knowledge distillation research that present opportunities for future work....\n", "==========================================================================================\n", "\n" ] @@ -956,7 +956,7 @@ "(Lee et al., 2023a), Zephy (Tunstall et al., 2023), UltraFeedback (Cui et al., 2023a), ValueCAI (Bai et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a) AgentTool UsingToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2...\n", "\n", "PROCESSED TEXT:\n", - "i et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a), AgentToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (Shen et al., 2024), PlanningFireAct (Chen et al., 2023b), Agent...\n", + "i et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a), AgentToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), \u03b1-UMi (Shen et al., 2024), PlanningFireAct (Chen et al., 2023b), Agent...\n", "==========================================================================================\n", "\n" ] @@ -1037,7 +1037,7 @@ "output_type": 
"stream", "text": [ "INPUT TEXT:\n", - "replicate the output behavior of the teacher model or reduce the model size , the current focus in LLM-based knowledge distillation is to extract and transfer the rich, nuanced understanding that these models have developed. The key to this modern approach lies in heuristic and carefully designed prompts, which are used to elicit specific knowledge (Ding et al., 2023b) or capabilities (Chaudhary, 2023) from the LLMs. These prompts are crafted to tap into the LLM’s understanding and capabilities ...\n", + "replicate the output behavior of the teacher model or reduce the model size , the current focus in LLM-based knowledge distillation is to extract and transfer the rich, nuanced understanding that these models have developed. The key to this modern approach lies in heuristic and carefully designed prompts, which are used to elicit specific knowledge (Ding et al., 2023b) or capabilities (Chaudhary, 2023) from the LLMs. These prompts are crafted to tap into the LLM\u2019s understanding and capabilities ...\n", "\n", "PROCESSED TEXT:\n", "size, the current focus in llm-based knowledge distillation is to extract and transfer the rich, nuanced understanding that these models have developed the key to this modern approach lies in carefully designed prompts that elicit specific knowledge or capabilities from the llms, tapping into their understanding and capabilities in various domains ranging from natural language understanding to more complex cognitive tasks like reasoning and problem-solving...\n", @@ -1097,7 +1097,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "as a potent mechanism for bridging the knowl- edge and capability gap between proprietary and open- source models. Through DA, LLMs are prompted to create targeted, high-quality datasets that are not merely larger in volume but are also rich in diversity and specificity. 
This approach enables the distillation process to be more effec- tive, ensuring that the distilled models not only replicate the teacher model’s output behavior but also embody its deep-seated understanding and cognitive strateg...\n", + "as a potent mechanism for bridging the knowl- edge and capability gap between proprietary and open- source models. Through DA, LLMs are prompted to create targeted, high-quality datasets that are not merely larger in volume but are also rich in diversity and specificity. This approach enables the distillation process to be more effec- tive, ensuring that the distilled models not only replicate the teacher model\u2019s output behavior but also embody its deep-seated understanding and cognitive strateg...\n", "\n", "PROCESSED TEXT:\n", "ource models, through Deep Learning Models (LLMs) are prompted to create targeted, high-quality datasets that are not merely larger in volume but also rich in diversity and specificity. This approach enables the distillation process to be more effective, ensuring that the distilled models replicate the teacher model's output behavior and embody its deep-seated understanding and cognitive strategies. The significance and necessity of Data Augmentation (DA) for achieving Knowledge Domains (KD) in ...\n", @@ -1137,7 +1137,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "distillation. KD Algorithms. This segment focuses on the technical foundations and methodologies of knowledge distillation. It includes an in-depth exploration of the processes involved in constructing knowledge from teacher models (e.g., pro- prietary LLMs) and integrating this knowledge into student models (e.g., open-source LLMs). Under the umbrella of ‘knowledge ’, we delve into strategies such as labeling (Hsieh et al., 2023), expansion (Taori et al., 2023), curation (Gu- nasekar et al., 20...\n", + "distillation. KD Algorithms. This segment focuses on the technical foundations and methodologies of knowledge distillation. 
It includes an in-depth exploration of the processes involved in constructing knowledge from teacher models (e.g., pro- prietary LLMs) and integrating this knowledge into student models (e.g., open-source LLMs). Under the umbrella of \u2018knowledge \u2019, we delve into strategies such as labeling (Hsieh et al., 2023), expansion (Taori et al., 2023), curation (Gu- nasekar et al., 20...\n", "\n", "PROCESSED TEXT:\n", "undations and methodologies of knowledge distillation. It includes an in-depth exploration of processes involved in constructing knowledge from teacher models (e.g., proprietary LLMs) and integrating this knowledge into student models (e.g., open-source LLMs). Under the umbrella of 'knowledge', we delve into strategies such as labeling, expansion, curation, feature understanding, and feedback mechanisms. The exploration seeks to uncover the various ways in which knowledge can be identified, expa...\n", @@ -1177,7 +1177,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "lan- guage generation (NLG), information retrieval, recommen- dation systems, text generation evaluation, and code gen- eration. Finally, the survey addresses multi-modality (Liu et al., 2023e; Zhao et al., 2023b), exploring how KD enhances LLMs’ ability to interpret and integrate multiple forms of input, enriching their utility and applicability across various contexts. Verticalization Distillation. This section assesses the ap- plication of KD across diverse vertical domains, offering insights...\n", + "lan- guage generation (NLG), information retrieval, recommen- dation systems, text generation evaluation, and code gen- eration. Finally, the survey addresses multi-modality (Liu et al., 2023e; Zhao et al., 2023b), exploring how KD enhances LLMs\u2019 ability to interpret and integrate multiple forms of input, enriching their utility and applicability across various contexts. Verticalization Distillation. 
This section assesses the ap- plication of KD across diverse vertical domains, offering insights...\n", "\n", "PROCESSED TEXT:\n", "tion, and Code Generation**\n", @@ -1250,7 +1250,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "seen in Figure 2. I. Target Skill or Domain Steering Teacher LLM. The first stage involves directing the teacher LLM towards a specific target skill or domain. This is achieved through care- fully crafted instructions or templates that guide the LLM’s focus. These instructions are designed to elicit responses that demonstrate the LLM’s proficiency in a particular area, be it a specialized domain like healthcare or law, or a skill such as reasoning or language understanding. The objective here is...\n", + "seen in Figure 2. I. Target Skill or Domain Steering Teacher LLM. The first stage involves directing the teacher LLM towards a specific target skill or domain. This is achieved through care- fully crafted instructions or templates that guide the LLM\u2019s focus. These instructions are designed to elicit responses that demonstrate the LLM\u2019s proficiency in a particular area, be it a specialized domain like healthcare or law, or a skill such as reasoning or language understanding. The objective here is...\n", "\n", "PROCESSED TEXT:\n", "ards a specific target skill or domain This is achieved through carefully crafted instructions or templates that guide the LLM's focus These instructions are designed to elicit responses that demonstrate the LLM's proficiency in a particular area be it a specialized domain like healthcare or law or a skill such as reasoning or language understanding The objective here is to utilize the teacher LLM's extensive training and nuanced capabilities to generate outputs that are rich in the specific kno...\n", @@ -1290,7 +1290,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "Specific Learn- ing Objective. 
The final stage involves the utilization of the generated knowledge examples to train the student model. This training is guided by a loss function that aligns with the learning objectives. The loss function quantifies the student model’s performance in replicating or adapting the knowledge from the teacher model. By minimizing this loss, the student model learns to emulate the target skills or domain knowledge of the teacher, thereby acquiring similar capabilities...\n", + "Specific Learn- ing Objective. The final stage involves the utilization of the generated knowledge examples to train the student model. This training is guided by a loss function that aligns with the learning objectives. The loss function quantifies the student model\u2019s performance in replicating or adapting the knowledge from the teacher model. By minimizing this loss, the student model learns to emulate the target skills or domain knowledge of the teacher, thereby acquiring similar capabilities...\n", "\n", "PROCESSED TEXT:\n", "knowledge examples to train the student model. This training is guided by a loss function that aligns with the learning objectives. The loss function quantifies the student model's performance in replicating or adapting the knowledge from the teacher model. By minimizing this loss, the student model learns to emulate the target skills or domain knowledge of the teacher, thereby acquiring similar capabilities....\n", @@ -1310,10 +1310,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "domain to steer the LLM and elicit knowledge, s∼ S denotes an example of the seed knowledge, upon which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM’s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters θT. 
Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L=X ILI(D(kd) I;θS), (2) whereP Idenotes there could be multiple task...\n", + "domain to steer the LLM and elicit knowledge, s\u223c S denotes an example of the seed knowledge, upon which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM\u2019s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters \u03b8T. Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L=X ILI(D(kd) I;\u03b8S), (2) whereP Idenotes there could be multiple task...\n", "\n", "PROCESSED TEXT:\n", - "which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM’s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters θT. Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L=X ILI(D(kd) I;θS), (2) where P Idenotes there could be multiple tasks or skills being distilled into one student model, LI(·;·)stands for a specific learning objecti...\n", + "which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM\u2019s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters \u03b8T. 
Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L=X ILI(D(kd) I;\u03b8S), (2) where P Idenotes there could be multiple tasks or skills being distilled into one student model, LI(\u00b7;\u00b7)stands for a specific learning objecti...\n", "==========================================================================================\n", "\n" ] @@ -1330,7 +1330,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "it is categorized into two principal steps: ‘Knowledge,’ focusing on eliciting knowledge from teacher LLMs (Eq.1), and ‘Distillation,’ centered on injecting this knowledge into student models (Eq.2). We will elaborate on these two processes in the subsequent sections. 3.1 Knowledge This section focuses on the approaches to elicit knowledge from teacher LLMs. According to the manners to acquire knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowled...\n", + "it is categorized into two principal steps: \u2018Knowledge,\u2019 focusing on eliciting knowledge from teacher LLMs (Eq.1), and \u2018Distillation,\u2019 centered on injecting this knowledge into student models (Eq.2). We will elaborate on these two processes in the subsequent sections. 3.1 Knowledge This section focuses on the approaches to elicit knowledge from teacher LLMs. According to the manners to acquire knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowled...\n", "\n", "PROCESSED TEXT:\n", "...\n", @@ -1350,10 +1350,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "dataset and feeding it into LLMs to obtain the desired generations. Moreover, the generation of yis controllable through the predefined Iandc. This process can be formulated as follows: D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}. (3) Input xcould be sourced from existing NLP task datasets, which serve as typical reservoirs for distillation efforts. 
Numerous works have sought to harness the capa- bilities of powerful LLMs as teachers for annotating dataset samples across a range of tasks. For instance, ef...\n", + "dataset and feeding it into LLMs to obtain the desired generations. Moreover, the generation of yis controllable through the predefined Iandc. This process can be formulated as follows: D(lab)={x, y|x\u223c X, y\u223cpT(y|I\u2295c\u2295x)}. (3) Input xcould be sourced from existing NLP task datasets, which serve as typical reservoirs for distillation efforts. Numerous works have sought to harness the capa- bilities of powerful LLMs as teachers for annotating dataset samples across a range of tasks. For instance, ef...\n", "\n", "PROCESSED TEXT:\n", - "is process can be formulated as follows: D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}....\n", + "is process can be formulated as follows: D(lab)={x, y|x\u223c X, y\u223cpT(y|I\u2295c\u2295x)}....\n", "==========================================================================================\n", "\n" ] @@ -1390,10 +1390,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "powerful LLMs, like ShareGPT. Additionally, Xu et al. (2023b) and Anand et al. (2023) label the real questions sampled from forums like Quora and Stack Overflow. Moreover, the process of labeling could be guided by instructions Ior demonstrations c. A commonly used in- struction type for guiding labeling is chain-of-thought (CoT) prompt (Hsieh et al., 2023; Fu et al., 2023; Magister et al., 2023). Mukherjee et al. (2023) add multiple system messages (e.g. “You must generate a detailed and long a...\n", + "powerful LLMs, like ShareGPT. Additionally, Xu et al. (2023b) and Anand et al. (2023) label the real questions sampled from forums like Quora and Stack Overflow. Moreover, the process of labeling could be guided by instructions Ior demonstrations c. 
A commonly used in- struction type for guiding labeling is chain-of-thought (CoT) prompt (Hsieh et al., 2023; Fu et al., 2023; Magister et al., 2023). Mukherjee et al. (2023) add multiple system messages (e.g. \u201cYou must generate a detailed and long a...\n", "\n", "PROCESSED TEXT:\n", - "023b) and Anand et al. (2023) label the real questions sampled from forums like Quora and Stack Overflow. Moreover, the process of labeling could be guided by instructions or demonstrations. A commonly used instruction type for guiding labeling is the chain-of-thought (CoT) prompt. Mukherjee et al. (2023) add multiple system messages (e.g. “You must generate a detailed and long answer.” or “explain like I’m five, think step-by-step”) to elicit rich signals. Yue et al. (2023a) and Chenglin et al....\n", + "023b) and Anand et al. (2023) label the real questions sampled from forums like Quora and Stack Overflow. Moreover, the process of labeling could be guided by instructions or demonstrations. A commonly used instruction type for guiding labeling is the chain-of-thought (CoT) prompt. Mukherjee et al. (2023) add multiple system messages (e.g. \u201cYou must generate a detailed and long answer.\u201d or \u201cexplain like I\u2019m five, think step-by-step\u201d) to elicit rich signals. Yue et al. (2023a) and Chenglin et al....\n", "==========================================================================================\n", "\n" ] @@ -1410,7 +1410,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "Generate≻≻𝑦\" 𝑦! 𝑦# 𝑥 𝑥& CorrectExpand𝑐 Fig. 5: An illustration of different knowledge elicitation methods from teacher LLMs. 
Labeling : The teacher generates the output from the input; Expansion : The teacher generates samples similar to the given demonstrations through in- context learning; Data Curation : The teacher synthesizes data according to meta-information, such as a topic or an entity; Feature : Feed the data into the teacher and extract its internal knowledge, such as logits and featu...\n", + "Generate\u227b\u227b\ud835\udc66\" \ud835\udc66! \ud835\udc66# \ud835\udc65 \ud835\udc65& CorrectExpand\ud835\udc50 Fig. 5: An illustration of different knowledge elicitation methods from teacher LLMs. Labeling : The teacher generates the output from the input; Expansion : The teacher generates samples similar to the given demonstrations through in- context learning; Data Curation : The teacher synthesizes data according to meta-information, such as a topic or an entity; Feature : Feed the data into the teacher and extract its internal knowledge, such as logits and featu...\n", "\n", "PROCESSED TEXT:\n", "utput from input; Teacher generates samples similar to given demonstrations through in-context learning; Data is curated according to meta-information such as topic or entity; Data is fed into the teacher to extract knowledge such as logits and features; Teacher provides feedback on student's output such as preferences, corrections, and expansions of challenging samples; Student generates outputs which is then filtered for high-quality or evaluated by student itself\"...\n", @@ -1450,7 +1450,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "the existing dataset, in the expansion approach, both x andyare generated by teacher LLMs. This process can be formulated as follows: D(exp)={(x, y)|x∼pT(x|I⊕c), y∼pT(y|I⊕x)}.(4) In this formulation, xand yrepresent the new input- output pairs generated by the teacher LLM. The input x is generated based on a set of input-output demonstrations c. 
The output yis then generated in response to the new input xunder the guidance of an instruction I. Note thatthe demonstrations could be predefined or d...\n", + "the existing dataset, in the expansion approach, both x andyare generated by teacher LLMs. This process can be formulated as follows: D(exp)={(x, y)|x\u223cpT(x|I\u2295c), y\u223cpT(y|I\u2295x)}.(4) In this formulation, xand yrepresent the new input- output pairs generated by the teacher LLM. The input x is generated based on a set of input-output demonstrations c. The output yis then generated in response to the new input xunder the guidance of an instruction I. Note thatthe demonstrations could be predefined or d...\n", "\n", "PROCESSED TEXT:\n", "...\n", @@ -1532,10 +1532,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "approach to synthesize data from scratch. Numerous diverse meta- information, such as topics or knowledge points, could be incorporated into this process to generate controllable x andy. Thus, this process can be meticulously controlled to yield datasets that are not only large in scale but also of high quality. The formulation for Data Curation can be represented as: D(cur)={(x, y)|x∼pT(x|I⊕m), y∼pT(y|I⊕x)}.(5) In this formulation, mrepresents the diverse meta- information used to guide the syn...\n", + "approach to synthesize data from scratch. Numerous diverse meta- information, such as topics or knowledge points, could be incorporated into this process to generate controllable x andy. Thus, this process can be meticulously controlled to yield datasets that are not only large in scale but also of high quality. The formulation for Data Curation can be represented as: D(cur)={(x, y)|x\u223cpT(x|I\u2295m), y\u223cpT(y|I\u2295x)}.(5) In this formulation, mrepresents the diverse meta- information used to guide the syn...\n", "\n", "PROCESSED TEXT:\n", - "edge points, could be incorporated into this process to generate controllable output. 
Thus, this process can be meticulously controlled to yield datasets that are not only large in scale but also of high quality. The formulation for Data Curation can be represented as: D(cur)={(x, y)|x∼pT(x|I⊕m), y∼pT(y|I⊕x)}. In this formulation, mrepresents the diverse meta-information used to guide the synthesis of x, and Iis the instruction guiding teacher LLMs to generate xory. Different studies primarily v...\n", + "edge points, could be incorporated into this process to generate controllable output. Thus, this process can be meticulously controlled to yield datasets that are not only large in scale but also of high quality. The formulation for Data Curation can be represented as: D(cur)={(x, y)|x\u223cpT(x|I\u2295m), y\u223cpT(y|I\u2295x)}. In this formulation, mrepresents the diverse meta-information used to guide the synthesis of x, and Iis the instruction guiding teacher LLMs to generate xory. Different studies primarily v...\n", "==========================================================================================\n", "\n" ] @@ -1552,7 +1552,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "the World , they explore 30 meta-topics like ”Technology” and ”Food and Drink.” the teacher LLMs then use this meta-information to distill a broad array of instructions and conversations, achieving a substantial scale of 1.5 million instances. UltraChat stands out with its lexical and topical diversity. The UltraLLaMA model, fine- tuned on this data, consistently surpasses other open-source models. Another notable series, phi(Gunasekar et al., 2023; Li et al., 2023a; Mar, 2023), focuses on disti...\n", + "the World , they explore 30 meta-topics like \u201dTechnology\u201d and \u201dFood and Drink.\u201d the teacher LLMs then use this meta-information to distill a broad array of instructions and conversations, achieving a substantial scale of 1.5 million instances. UltraChat stands out with its lexical and topical diversity. 
The UltraLLaMA model, fine- tuned on this data, consistently surpasses other open-source models. Another notable series, phi(Gunasekar et al., 2023; Li et al., 2023a; Mar, 2023), focuses on disti...\n", "\n", "PROCESSED TEXT:\n", "ion to distill a broad array of instructions and conversations, resulting in a substantial scale of 1.5 million instances. UltraChat stands out with its lexical and topical diversity, fine-tuned on this data to consistently surpass other open-source models....\n", @@ -1632,7 +1632,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "(such as output distri- bution) from the teacher LLM. 10 The most straightforward method to elicit feature knowl- edge of teacher is to label a fixed dataset of sequences with token-level probability distributions (Sanh et al., 2019; Wen et al., 2023). To leverage the rich semantic and syntactic knowledge in intermediate layers of the teacher model, TED (Liang et al., 2023a) designs task-aware layer-wise distillation. They align the student’s hidden representations with those of the teacher at e...\n", + "(such as output distri- bution) from the teacher LLM. 10 The most straightforward method to elicit feature knowl- edge of teacher is to label a fixed dataset of sequences with token-level probability distributions (Sanh et al., 2019; Wen et al., 2023). To leverage the rich semantic and syntactic knowledge in intermediate layers of the teacher model, TED (Liang et al., 2023a) designs task-aware layer-wise distillation. They align the student\u2019s hidden representations with those of the teacher at e...\n", "\n", "PROCESSED TEXT:\n", "d to elicit feature knowledge of teacher is to label a fixed dataset of sequences with token-level probability distributions. TED (Liang et al., 2023a) designs task-aware layer-wise distillation. They align the student's hidden representations with those of the teacher at each layer, selectively extracting knowledge pertinent to the target task. Gu et al. 
(2024) and Agarwal et al. (2024) introduce a novel approach where the student model generates sequences, termed'self-generated sequences'. The...\n", @@ -1692,12 +1692,12 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "through Reinforcement Learning from AI Feedback (RLAIF) (Bai et al., 2022a). Here is a generalized formulation for eliciting feedback knowledge: D(fb)={(x, y, ϕ fb(x, y;θT))|x∼ X, y∼pS(y|x)}, (7) where ydenotes the output generated by the student model in response to x, and ϕfb(·;θT))represents providing feedback from teacher LLMs. This operation evaluates thestudent’s output ygiven the input x, by offering assess- ment, corrective information, or other forms of guidance. This feedback knowledge...\n", + "through Reinforcement Learning from AI Feedback (RLAIF) (Bai et al., 2022a). Here is a generalized formulation for eliciting feedback knowledge: D(fb)={(x, y, \u03d5 fb(x, y;\u03b8T))|x\u223c X, y\u223cpS(y|x)}, (7) where ydenotes the output generated by the student model in response to x, and \u03d5fb(\u00b7;\u03b8T))represents providing feedback from teacher LLMs. This operation evaluates thestudent\u2019s output ygiven the input x, by offering assess- ment, corrective information, or other forms of guidance. This feedback knowledge...\n", "\n", "PROCESSED TEXT:\n", "2022a). This generalized formulation for eliciting feedback knowledge involves the following steps: \n", "\n", - "1. D(fb)={(x, y, ϕ fb(x, y;θT))|x∼ X, y∼pS(y|x)}, where ydenotes the output generated by the student model in response to x, and ϕfb(·;θT))represents providing feedback from teacher LLMs. This operation evaluates the student’s output ygiven the input x, by offering assessment, corrective information, or other forms of guidance. This feedback knowledge enables the student to refine its responses ...\n", + "1. 
D(fb)={(x, y, \u03d5 fb(x, y;\u03b8T))|x\u223c X, y\u223cpS(y|x)}, where ydenotes the output generated by the student model in response to x, and \u03d5fb(\u00b7;\u03b8T))represents providing feedback from teacher LLMs. This operation evaluates the student\u2019s output ygiven the input x, by offering assessment, corrective information, or other forms of guidance. This feedback knowledge enables the student to refine its responses ...\n", "==========================================================================================\n", "\n" ] @@ -1737,7 +1737,7 @@ "various instructions and models to produce comparative data. Then, GPT-4 is used to score candidates from various aspects of preference, including instruction-following, truthfulness, honesty and helpfulness. Beyond merely assessing student generations, teachers can also furnish extensive feedback on instances where students underperform. In Lion (Jiang et al., 2023b), teacher model pinpoints instructions that pose challenges to the student model, generating new, more difficult instructions aime...\n", "\n", "PROCESSED TEXT:\n", - "s from various aspects of preference, including instruction-following, truthfulness, honesty and helpfulness. Beyond merely assessing student generations, teachers can also furnish extensive feedback on instances where students underperform. In Lion (Jiang et al., 2023b), teacher model pinpoints instructions that pose challenges to the student model, generating new, more difficult instructions aimed at bolstering the student’s abilities. PERsD (Chen et al., 2023a) showcases a method where teache...\n", + "s from various aspects of preference, including instruction-following, truthfulness, honesty and helpfulness. Beyond merely assessing student generations, teachers can also furnish extensive feedback on instances where students underperform. 
In Lion (Jiang et al., 2023b), teacher model pinpoints instructions that pose challenges to the student model, generating new, more difficult instructions aimed at bolstering the student\u2019s abilities. PERsD (Chen et al., 2023a) showcases a method where teache...\n", "==========================================================================================\n", "\n" ] @@ -1754,10 +1754,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "teacher model’s distribution over the student’s generations can itself act as a form of feedback. MiniLLM (Gu et al., 2024) and GKD (Agarwal et al., 2024) present an innovative strategy wherein the student model initially generates sequences, followed by teacher model producing an output distribution as feedback. This method leverages the teacher’s insight to directly inform and refine the student model’s learning process. 3.1.6 Self-Knowledge The knowledge could also be elicited from the studen...\n", + "teacher model\u2019s distribution over the student\u2019s generations can itself act as a form of feedback. MiniLLM (Gu et al., 2024) and GKD (Agarwal et al., 2024) present an innovative strategy wherein the student model initially generates sequences, followed by teacher model producing an output distribution as feedback. This method leverages the teacher\u2019s insight to directly inform and refine the student model\u2019s learning process. 3.1.6 Self-Knowledge The knowledge could also be elicited from the studen...\n", "\n", "PROCESSED TEXT:\n", - "iniLLM and GKD present an innovative strategy wherein the student model generates sequences, followed by the teacher model producing an output distribution as feedback. This method leverages the teacher’s insight to directly inform and refine the student model’s learning process. 3.1.6 Self-Knowledge The knowledge can be elicited from the student itself, which we refer to as Self-Knowledge. 
In this setting, the same model acts both as the teacher and the student, iteratively improving itself by ...\n", + "iniLLM and GKD present an innovative strategy wherein the student model generates sequences, followed by the teacher model producing an output distribution as feedback. This method leverages the teacher\u2019s insight to directly inform and refine the student model\u2019s learning process. 3.1.6 Self-Knowledge The knowledge can be elicited from the student itself, which we refer to as Self-Knowledge. In this setting, the same model acts both as the teacher and the student, iteratively improving itself by ...\n", "==========================================================================================\n", "\n" ] @@ -1774,7 +1774,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "self-knowledge could be formulated as: D(sk)={(x, y, ϕ sk(x, y))|x∼ S, y∼pS(y|I⊕x)},(8) where ϕsk(·)is a generalized function that represents an additional process to the self-generated outputs y, which could include but is not limited to filtering, rewarding, or any other mechanisms for enhancing or evaluating y. It could be governed by external tools or the student itself θS. Recent research in this area has proposed various innovative methodologies to elicit self-knowledge, demonstrating its ...\n", + "self-knowledge could be formulated as: D(sk)={(x, y, \u03d5 sk(x, y))|x\u223c S, y\u223cpS(y|I\u2295x)},(8) where \u03d5sk(\u00b7)is a generalized function that represents an additional process to the self-generated outputs y, which could include but is not limited to filtering, rewarding, or any other mechanisms for enhancing or evaluating y. It could be governed by external tools or the student itself \u03b8S. 
Recent research in this area has proposed various innovative methodologies to elicit self-knowledge, demonstrating its ...\n", "\n", "PROCESSED TEXT:\n", "at represents an additional process to the self-generated outputs y, which could include but is not limited to filtering, rewarding, or any other mechanisms for enhancing or evaluating y....\n", @@ -1834,10 +1834,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "and filtered using a scoring function. Subsequently, the lan- guage model undergoes fine-tuning on this curated dataset,employing an offline RL objective. Self-Play (Chen et al., 2024a) introduces a framework resembling iterative DPO, where the language model is fine-tuned to differentiate the self-generated responses from the human-annotated data. These self-generated responses could be seen as “negative knowledge” to promote the student to better align with the target distribution. Self-Reward...\n", + "and filtered using a scoring function. Subsequently, the lan- guage model undergoes fine-tuning on this curated dataset,employing an offline RL objective. Self-Play (Chen et al., 2024a) introduces a framework resembling iterative DPO, where the language model is fine-tuned to differentiate the self-generated responses from the human-annotated data. These self-generated responses could be seen as \u201cnegative knowledge\u201d to promote the student to better align with the target distribution. Self-Reward...\n", "\n", "PROCESSED TEXT:\n", - "this curated dataset, employing an offline RL objective. Self-Play (Chen et al., 2024a) introduces a framework resembling iterative DPO, where the language model is fine-tuned to differentiate the self-generated responses from the human-annotated data. These self-generated responses could be seen as “negative knowledge” to promote the student to better align with the target distribution. 
Self-Rewarding (Yuan et al., 2024a) explores a novel and promising approach by utilizing the language model i...\n", + "this curated dataset, employing an offline RL objective. Self-Play (Chen et al., 2024a) introduces a framework resembling iterative DPO, where the language model is fine-tuned to differentiate the self-generated responses from the human-annotated data. These self-generated responses could be seen as \u201cnegative knowledge\u201d to promote the student to better align with the target distribution. Self-Rewarding (Yuan et al., 2024a) explores a novel and promising approach by utilizing the language model i...\n", "==========================================================================================\n", "\n" ] @@ -1874,7 +1874,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "LLMs (Taori et al., 2023; Chiang et al., 2023; Wu et al., 2023c; Xu et al., 2023a; Luo et al., 2023b). Additionally, SFT has been ex- plored in many self-distillation works (Wang et al., 2022a; Huang et al., 2023c; Xu et al., 2023b; Zelikman et al., 2022). Due to the large number of KD works applying SFT, we only list representative ones here. More detailed works can be found in §4. 3.2.2 Divergence and Similarity This section mainly concentrates on algorithms designed for distilling feature kno...\n", + "LLMs (Taori et al., 2023; Chiang et al., 2023; Wu et al., 2023c; Xu et al., 2023a; Luo et al., 2023b). Additionally, SFT has been ex- plored in many self-distillation works (Wang et al., 2022a; Huang et al., 2023c; Xu et al., 2023b; Zelikman et al., 2022). Due to the large number of KD works applying SFT, we only list representative ones here. More detailed works can be found in \u00a74. 
3.2.2 Divergence and Similarity This section mainly concentrates on algorithms designed for distilling feature kno...\n", "\n", "PROCESSED TEXT:\n", "works....\n", @@ -1894,11 +1894,11 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "log2p(t) p(t)+q(t)+Pq(t) log2q(t) p(t)+q(t)\u0011 TABLE 1: Functional forms of Dfor various divergence types. p: reference Similarity Function LF Expression L2-Norm Distance ∥ΦT(fT(x, y))−ΦS(fS(x, y))∥2 L1-Norm Distance ∥ΦT(fT(x, y))−ΦS(fS(x, y))∥1 Cross-Entropy Loss −PΦT(fT(x, y)) log(Φ S(fS(x, y))) Maximum Mean Discrepancy MMD (ΦT(fT(x, y)),ΦS(fS(x, y))) TABLE 2: Summary of similarity functions in knowledge distillation. and student models, represented by a general divergence function D: LDiv= E x∼...\n", + "log2p(t) p(t)+q(t)+Pq(t) log2q(t) p(t)+q(t)\u0011 TABLE 1: Functional forms of Dfor various divergence types. p: reference Similarity Function LF Expression L2-Norm Distance \u2225\u03a6T(fT(x, y))\u2212\u03a6S(fS(x, y))\u22252 L1-Norm Distance \u2225\u03a6T(fT(x, y))\u2212\u03a6S(fS(x, y))\u22251 Cross-Entropy Loss \u2212P\u03a6T(fT(x, y)) log(\u03a6 S(fS(x, y))) Maximum Mean Discrepancy MMD (\u03a6T(fT(x, y)),\u03a6S(fS(x, y))) TABLE 2: Summary of similarity functions in knowledge distillation. 
and student models, represented by a general divergence function D: LDiv= E x\u223c...\n", "\n", "PROCESSED TEXT:\n", "types\n", - "p: reference Similarity Function L2-Norm Distance ∥ΦT(fT(x, y))−ΦS(fS(x, y))∥2 L1-Norm Distance ∥ΦT(fT(x, y))−ΦS(fS(x, y))∥1 Cross-Entropy Loss −PΦT(fT(x, y)) log(Φ S(fS(x, y))) Maximum Mean Discrepancy MMD (ΦT(fT(x, y)),ΦS(fS(x, y)))...\n", + "p: reference Similarity Function L2-Norm Distance \u2225\u03a6T(fT(x, y))\u2212\u03a6S(fS(x, y))\u22252 L1-Norm Distance \u2225\u03a6T(fT(x, y))\u2212\u03a6S(fS(x, y))\u22251 Cross-Entropy Loss \u2212P\u03a6T(fT(x, y)) log(\u03a6 S(fS(x, y))) Maximum Mean Discrepancy MMD (\u03a6T(fT(x, y)),\u03a6S(fS(x, y)))...\n", "==========================================================================================\n", "\n" ] @@ -1915,12 +1915,12 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "modes of pT. However, when a student model is unable to learn all modes of a highly complex teacher, the re- sultant “mode-covering” behavior might cause the student to assign probability mass to tokens with low probability under the teacher’s distribution (cf. Figure 6 blue curve). This mode-covering phenomenon can potentially lead to hallucinations and low-quality generations. Alternatively, mode-seeking divergences like reverse KL prioritize tokens where the teacher assigns high probabilities...\n", + "modes of pT. However, when a student model is unable to learn all modes of a highly complex teacher, the re- sultant \u201cmode-covering\u201d behavior might cause the student to assign probability mass to tokens with low probability under the teacher\u2019s distribution (cf. Figure 6 blue curve). This mode-covering phenomenon can potentially lead to hallucinations and low-quality generations. 
Alternatively, mode-seeking divergences like reverse KL prioritize tokens where the teacher assigns high probabilities...\n", "\n", "PROCESSED TEXT:\n", "probability mass to tokens with low probability under the teacher's distribution. This can result in hallucinations and low-quality generations. \n", "\n", - "mode-seeking divergences, such as reverse KL, prioritize tokens with high probabilities, mitigating the risk of low-quality outputs. However, they often come at the cost of reduced diversity. Gu et al. (2024) use policy gradient methods to optimize for this approach, while Agarwal et al. (2024) and Sason and Verd´u (2016) assess the efficacy of differ...\n", + "mode-seeking divergences, such as reverse KL, prioritize tokens with high probabilities, mitigating the risk of low-quality outputs. However, they often come at the cost of reduced diversity. Gu et al. (2024) use policy gradient methods to optimize for this approach, while Agarwal et al. (2024) and Sason and Verd\u00b4u (2016) assess the efficacy of differ...\n", "==========================================================================================\n", "\n" ] @@ -1957,7 +1957,7 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "“mode-seeking” behavior. model with those of the teacher. These methods use various similarity metrics to measure and optimize the congruence of internal representations between the two models. The objective is to ensure that the student model not only produces similar outputs to the teacher but also processes information in a comparable manner. The formulation for a similarity-based objective might look like this: LSim= E x∼X,y∼Y[LF(ΦT(fT(x, y)),ΦS(fS(x, y)))],(11) where fT(x, y)andfS(x, y)are ...\n", + "\u201cmode-seeking\u201d behavior. model with those of the teacher. These methods use various similarity metrics to measure and optimize the congruence of internal representations between the two models. 
The objective is to ensure that the student model not only produces similar outputs to the teacher but also processes information in a comparable manner. The formulation for a similarity-based objective might look like this: LSim= E x\u223cX,y\u223cY[LF(\u03a6T(fT(x, y)),\u03a6S(fS(x, y)))],(11) where fT(x, y)andfS(x, y)are ...\n", "\n", "PROCESSED TEXT:\n", "ntations...\n", @@ -1999,10 +1999,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "2024b; Ma et al., 2023a; Pang et al., 2023; Du et al., 2023a). The RL-based distillation process typically involves two main stages: 13 Distilled Reward Model Training. The first stage involves training a reward model rϕusing the feedback data D(fd) generated by teacher LLMs. Preference data, as one of the typical feedback, is employed to train the student reward model (Bai et al., 2022a; Cui et al., 2023a; Lee et al., 2023a; Kim et al., 2023a). They usually consist of input-output pairs (x, yw,...\n", + "2024b; Ma et al., 2023a; Pang et al., 2023; Du et al., 2023a). The RL-based distillation process typically involves two main stages: 13 Distilled Reward Model Training. The first stage involves training a reward model r\u03d5using the feedback data D(fd) generated by teacher LLMs. Preference data, as one of the typical feedback, is employed to train the student reward model (Bai et al., 2022a; Cui et al., 2023a; Lee et al., 2023a; Kim et al., 2023a). They usually consist of input-output pairs (x, yw,...\n", "\n", "PROCESSED TEXT:\n", - "3 Distilled Reward Model Training. First stage involves training a reward model ϕ using feedback data D(fd) generated by teacher LLMs. Preference data, one of typical feedback, is used to train the student reward model. This typically consists of input-output pairs (x, yw, yl). Here, ywandyl represent \"winning\" and \"losing\" outputs relative to the teacher's preferences. 
Loss function for the reward model is defined as: LRM(rϕ,D(fd)) = - E (x,yw,yl) ∼D(fd)[logσ(rϕ(x, yw) - rϕ(x, yl))]...\n", + "3 Distilled Reward Model Training. First stage involves training a reward model \u03d5 using feedback data D(fd) generated by teacher LLMs. Preference data, one of typical feedback, is used to train the student reward model. This typically consists of input-output pairs (x, yw, yl). Here, ywandyl represent \"winning\" and \"losing\" outputs relative to the teacher's preferences. Loss function for the reward model is defined as: LRM(r\u03d5,D(fd)) = - E (x,yw,yl) \u223cD(fd)[log\u03c3(r\u03d5(x, yw) - r\u03d5(x, yl))]...\n", "==========================================================================================\n", "\n" ] @@ -2019,10 +2019,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "model. It is trained on an erroneous solution rewriting data distilled from a teacher LLM. This distilled reward model can pro- duce token-level rewards for RL training. Reinforcement Learning Optimization. In the second stage, the student model, represented by a policy πθ, is optimized to maximize the expected reward as per the trained reward model. Simultaneously, it minimizes the divergence from a reference policy πref, typically the initial policy of the student model trained by SFT, control...\n", + "model. It is trained on an erroneous solution rewriting data distilled from a teacher LLM. This distilled reward model can pro- duce token-level rewards for RL training. Reinforcement Learning Optimization. In the second stage, the student model, represented by a policy \u03c0\u03b8, is optimized to maximize the expected reward as per the trained reward model. Simultaneously, it minimizes the divergence from a reference policy \u03c0ref, typically the initial policy of the student model trained by SFT, control...\n", "\n", "PROCESSED TEXT:\n", - "reward model can pro- duce token-level rewards for RL training. 
Reinforcement Learning Optimization. In the second stage, the student model, represented by a policy πθ, is optimized to maximize the expected reward as per the trained reward model. Simultaneously, it minimizes the divergence from a reference policy πref, typically the initial policy of the student model trained by SFT, controlled by a factor β. The RL objective is given by: max πθE x∼X,y∼πθ(y|x)[rϕ(x, y)]−βDKL[πθ(y|x)∥πref(y|x)] (...\n", + "reward model can pro- duce token-level rewards for RL training. Reinforcement Learning Optimization. In the second stage, the student model, represented by a policy \u03c0\u03b8, is optimized to maximize the expected reward as per the trained reward model. Simultaneously, it minimizes the divergence from a reference policy \u03c0ref, typically the initial policy of the student model trained by SFT, controlled by a factor \u03b2. The RL objective is given by: max \u03c0\u03b8E x\u223cX,y\u223c\u03c0\u03b8(y|x)[r\u03d5(x, y)]\u2212\u03b2DKL[\u03c0\u03b8(y|x)\u2225\u03c0ref(y|x)] (...\n", "==========================================================================================\n", "\n" ] @@ -2067,10 +2067,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "ranking optimization todistill teacher’s preferences into student models (Tunstall et al., 2023; Hong et al., 2023; Yuan et al., 2024a). Zephyr (Tunstall et al., 2023) utilizes Direct Preference Optimization (DPO) (Rafailov et al., 2023) to distill the preference alignment in teacher LLMs. DPO streamlines the objective of reinforcement learning (as in Eq. 13), which involves reward maximization with a KL-divergence constraint, into a single-stage policy training. Specifically, DPO’s training goa...\n", + "ranking optimization todistill teacher\u2019s preferences into student models (Tunstall et al., 2023; Hong et al., 2023; Yuan et al., 2024a). 
Zephyr (Tunstall et al., 2023) utilizes Direct Preference Optimization (DPO) (Rafailov et al., 2023) to distill the preference alignment in teacher LLMs. DPO streamlines the objective of reinforcement learning (as in Eq. 13), which involves reward maximization with a KL-divergence constraint, into a single-stage policy training. Specifically, DPO\u2019s training goa...\n", "\n", "PROCESSED TEXT:\n", - "ng et al., 2023; Yuan et al., 2024a). Zephyr (Tunstall et al., 2023) utilizes Direct Preference Optimization (DPO) (Rafailov et al., 2023) to distill the preference alignment in teacher LLMs. DPO streamlines the objective of reinforcement learning (as in Eq. 13), which involves reward maximization with a KL-divergence constraint, into a single-stage policy training. Specifically, DPO’s training goal is to maximize the following expectation: E (x,yw,yl)∼D fd logπθ(yw|x) πref(yw|x)−βlogπθ(yl|x) πr...\n", + "ng et al., 2023; Yuan et al., 2024a). Zephyr (Tunstall et al., 2023) utilizes Direct Preference Optimization (DPO) (Rafailov et al., 2023) to distill the preference alignment in teacher LLMs. DPO streamlines the objective of reinforcement learning (as in Eq. 13), which involves reward maximization with a KL-divergence constraint, into a single-stage policy training. 
Specifically, DPO\u2019s training goal is to maximize the following expectation: E (x,yw,yl)\u223cD fd log\u03c0\u03b8(yw|x) \u03c0ref(yw|x)\u2212\u03b2log\u03c0\u03b8(yl|x) \u03c0r...\n", "==========================================================================================\n", "\n" ] @@ -2087,10 +2087,10 @@ "output_type": "stream", "text": [ "INPUT TEXT:\n", - "LRRHF =X ri=3.0.0 +pypdf>=4.0.0 torch>=2.0.0 transformers>=4.46.0 accelerate>=0.27.0 diff --git a/end-to-end-use-cases/book-character-mindmap/server/requirements.txt b/end-to-end-use-cases/book-character-mindmap/server/requirements.txt index 10ad93068..d61796517 100644 --- a/end-to-end-use-cases/book-character-mindmap/server/requirements.txt +++ b/end-to-end-use-cases/book-character-mindmap/server/requirements.txt @@ -1,6 +1,5 @@ flask flask-cors -asyncio werkzeug vllm transformers diff --git a/end-to-end-use-cases/book-character-mindmap/server/server.py b/end-to-end-use-cases/book-character-mindmap/server/server.py index c134862bb..b1c4ac17e 100644 --- a/end-to-end-use-cases/book-character-mindmap/server/server.py +++ b/end-to-end-use-cases/book-character-mindmap/server/server.py @@ -5,7 +5,7 @@ from flask import Flask, jsonify, request from flask_cors import CORS from transformers import AutoTokenizer -from vllm import LLM, sampling_params, SamplingParams +from vllm import LLM, SamplingParams # Flask setup app = Flask(__name__) diff --git a/end-to-end-use-cases/github_triage/requirements.txt b/end-to-end-use-cases/github_triage/requirements.txt index 4c37b8f27..09ad93f59 100644 --- a/end-to-end-use-cases/github_triage/requirements.txt +++ b/end-to-end-use-cases/github_triage/requirements.txt @@ -3,4 +3,3 @@ plotly openai groq fpdf -plotly diff --git a/end-to-end-use-cases/research_paper_analyzer/requirements.txt b/end-to-end-use-cases/research_paper_analyzer/requirements.txt index 425d40e77..31ec13ce4 100644 --- a/end-to-end-use-cases/research_paper_analyzer/requirements.txt +++ 
b/end-to-end-use-cases/research_paper_analyzer/requirements.txt @@ -1,5 +1,4 @@ gradio together requests -pypdf2 -gradio +pypdf