From 25fa8f79e1040ed822545e4797db3730ace83717 Mon Sep 17 00:00:00 2001
From: groxaxo <groxaxo@example.com>
Date: Thu, 4 Sep 2025 03:28:13 +0000
Subject: [PATCH] Add MCP server implementation for Synthetic Data Kit

- Created MCP server that exposes all Synthetic Data Kit functionality
- Added test scripts and example usage
- Updated pyproject.toml to include MCP server entry point
- Added comprehensive documentation in MCP_README.md

---
 MCP_README.md                    | 107 +++++++++++
 example_mcp_usage.py             | 125 ++++++++++++
 pyproject.toml                   |   1 +
 synthetic_data_kit/mcp_server.py | 317 +++++++++++++++++++++++++++++++
 test_mcp_server.py               |  49 +++++
 5 files changed, 599 insertions(+)
 create mode 100644 MCP_README.md
 create mode 100644 example_mcp_usage.py
 create mode 100644 synthetic_data_kit/mcp_server.py
 create mode 100644 test_mcp_server.py

diff --git a/MCP_README.md b/MCP_README.md
new file mode 100644
index 00000000..a9c46ec1
--- /dev/null
+++ b/MCP_README.md
@@ -0,0 +1,107 @@
+# Synthetic Data Kit MCP Server
+
+This directory contains an MCP (Model Context Protocol) server implementation for the Synthetic Data Kit. This allows MCP-compatible clients to interact with the Synthetic Data Kit through a standardized protocol.
+
+## Features
+
+The MCP server provides access to all major functionalities of the Synthetic Data Kit:
+
+1. **Document Ingestion** - Parse documents (PDF, HTML, YouTube, DOCX, PPT, TXT) into clean text
+2. **Content Creation** - Generate QA pairs, summaries, and Chain of Thought examples
+3. **Content Curation** - Clean and filter content based on quality
+4. **Format Conversion** - Convert to different formats for fine-tuning
+5. **System Check** - Verify LLM provider connectivity
+
+## Installation
+
+The MCP server is automatically installed when you install the Synthetic Data Kit in development mode:
+
+```bash
+cd /path/to/synthetic-data-kit
+pip install -e .
+```
+
+## Usage
+
+### Starting the MCP Server
+
+To start the MCP server directly:
+
+```bash
+synthetic-data-kit-mcp
+```
+
+This will start the server and listen for MCP connections over stdio.
+
+### Using with an MCP Client
+
+The server can be used with any MCP-compatible client. For example, in Claude Desktop or another MCP client, register `synthetic-data-kit-mcp` as a stdio server command so that the client launches it and talks to it over stdin/stdout.
+
+### Tools Available
+
+The server exposes the following tools:
+
+1. `sdk_ingest` - Parse documents into clean text
+2. `sdk_create` - Generate QA pairs, summaries, or Chain of Thought examples from text
+3. `sdk_curate` - Clean and filter content based on quality
+4. `sdk_save_as` - Convert to different formats for fine-tuning
+5. `sdk_system_check` - Check if the selected LLM provider's server is running
+
+Each tool maps directly to the corresponding CLI command in the Synthetic Data Kit.
+
+### Prompts Available
+
+The server also provides prompts:
+
+1. `sdk-workflow` - A complete workflow for generating synthetic data
+
+## Example Usage
+
+Here's an example of how an MCP client might interact with the server:
+
+1. Client requests tool list → Server responds with available tools
+2. Client calls `sdk_ingest` with a document path → Server runs `synthetic-data-kit ingest`
+3. Client calls `sdk_create` with parameters → Server runs `synthetic-data-kit create`
+4. Client calls `sdk_curate` to filter results → Server runs `synthetic-data-kit curate`
+5. Client calls `sdk_save_as` to convert formats → Server runs `synthetic-data-kit save-as`
+
+## Development
+
+To test the MCP server:
+
+```bash
+cd /path/to/synthetic-data-kit
+python test_mcp_server.py
+```
+
+To run a complete example:
+
+```bash
+cd /path/to/synthetic-data-kit
+python example_mcp_usage.py
+```
+
+## Architecture
+
+The MCP server acts as a bridge between MCP clients and the Synthetic Data Kit CLI:
+
+```
+MCP Client ↔ MCP Server ↔ Synthetic Data Kit CLI
+```
+
+All commands are executed as subprocess calls to the CLI, ensuring full compatibility with existing functionality.
+
+## Configuration
+
+The MCP server uses the same configuration as the Synthetic Data Kit CLI. Make sure your `config.yaml` is properly set up before using the server.
+
+## Troubleshooting
+
+If you encounter issues:
+
+1. Ensure the Synthetic Data Kit is properly installed: `pip install -e .`
+2. Verify the CLI works: `synthetic-data-kit --help`
+3. Check that required dependencies are installed, including the `mcp` Python package that the server imports
+4. Ensure your LLM provider (vLLM or API endpoint) is properly configured and running
+
+Note: For API endpoints, you'll need to set the appropriate API keys in your environment or configuration file.
\ No newline at end of file
diff --git a/example_mcp_usage.py b/example_mcp_usage.py
new file mode 100644
index 00000000..34f37b27
--- /dev/null
+++ b/example_mcp_usage.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Example script demonstrating how to use the Synthetic Data Kit MCP server.
+This script shows how to process a document using the MCP server.
+"""
+
+import asyncio
+import json
+import sys
+import os
+from typing import Any, Dict, List
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+from mcp.types import TextContent
+
+
+async def process_document_example():
+    """Drive ingest, create, curate and save-as through the MCP server on a throwaway sample file."""
+    # Get the current working directory
+    cwd = os.getcwd()
+    
+    # Start the MCP server as a subprocess
+    server_params = StdioServerParameters(
+        command=sys.executable,
+        args=["-m", "synthetic_data_kit.mcp_server"],
+        cwd=cwd
+    )
+    
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            # Initialize the session
+            await session.initialize()
+            
+            # Create a sample text file to process
+            sample_text = """
+            Artificial Intelligence (AI) is a branch of computer science that aims to create software or machines that exhibit human-like intelligence. 
+            This can include learning from experience, understanding natural language, solving problems, and recognizing patterns.
+            
+            Machine Learning (ML) is a subset of AI that focuses on algorithms and statistical models that enable computers to improve at tasks 
+            with experience. Deep Learning is a further subset of ML that uses neural networks with multiple layers.
+            
+            Natural Language Processing (NLP) is another important area of AI that deals with the interaction between computers and humans 
+            using natural language. It involves tasks like language translation, sentiment analysis, and text summarization.
+            
+            Computer Vision is yet another field that enables computers to interpret and understand visual information from the world, 
+            including image and video recognition.
+            """
+            
+            # Write the sample text to a file (explicit encoding so the result is platform-independent)
+            with open("sample_document.txt", "w", encoding="utf-8") as f:
+                f.write(sample_text)
+            
+            print("=== Processing Sample Document ===")
+            print("Sample document created: sample_document.txt")
+            
+            # 1. Use the ingest tool to process the document
+            print("\n1. Ingesting document...")
+            try:
+                result = await session.call_tool(
+                    "sdk_ingest",
+                    {
+                        "input": "sample_document.txt",
+                        "output_dir": "data/parsed"
+                    }
+                )
+                print(f"Ingest result: {result}")
+            except Exception as e:
+                print(f"Error during ingestion: {e}")
+            
+            # 2. Use the create tool to generate QA pairs
+            print("\n2. Creating QA pairs...")
+            try:
+                result = await session.call_tool(
+                    "sdk_create",
+                    {
+                        "input": "data/parsed/sample_document.lance",
+                        "content_type": "qa",
+                        "num_pairs": 5
+                    }
+                )
+                print(f"Create result: {result}")
+            except Exception as e:
+                print(f"Error during creation: {e}")
+            
+            # 3. Use the curate tool to filter content
+            print("\n3. Curating content...")
+            try:
+                result = await session.call_tool(
+                    "sdk_curate",
+                    {
+                        "input": "data/generated/sample_document_qa_pairs.json",
+                        "threshold": 7.0
+                    }
+                )
+                print(f"Curate result: {result}")
+            except Exception as e:
+                print(f"Error during curation: {e}")
+            
+            # 4. Use the save-as tool to convert format
+            print("\n4. Saving in final format...")
+            try:
+                result = await session.call_tool(
+                    "sdk_save_as",
+                    {
+                        "input": "data/curated/sample_document_cleaned.json",
+                        "format": "alpaca"
+                    }
+                )
+                print(f"Save-as result: {result}")
+            except Exception as e:
+                print(f"Error during format conversion: {e}")
+            
+            print("\n=== Document Processing Complete ===")
+            
+            # Clean up sample files; a failed removal is reported but does not fail the example
+            try:
+                os.remove("sample_document.txt")
+                print("Cleaned up sample_document.txt")
+            except OSError as e:
+                print(f"Could not remove sample_document.txt: {e}")
+
+
+if __name__ == "__main__":
+    asyncio.run(process_document_example())
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 639d14ef..d378c425 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ classifiers = [
 
 [project.scripts]
 synthetic-data-kit = "synthetic_data_kit.cli:app"
+synthetic-data-kit-mcp = "synthetic_data_kit.mcp_server:main"
 
 [tool.hatch.build.targets.wheel]
 packages = ["synthetic_data_kit"]
diff --git a/synthetic_data_kit/mcp_server.py b/synthetic_data_kit/mcp_server.py
new file mode 100644
index 00000000..0f45984d
--- /dev/null
+++ b/synthetic_data_kit/mcp_server.py
@@ -0,0 +1,317 @@
+"""
+MCP (Model Context Protocol) server for the Synthetic Data Kit.
+This server allows MCP-compatible clients to interact with the Synthetic Data Kit.
+"""
+
+import asyncio
+import json
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from mcp.server import Server
+from mcp.server.stdio import stdio_server
+from mcp.types import (
+    GetPromptResult, Prompt, PromptArgument, PromptMessage, TextContent,
+    Tool, ListRootsResult, Root
+)
+
+# Logging goes to stderr (basicConfig's default stream); stdout carries the MCP stdio traffic
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Server instance that the decorated handlers in this module register against
+server = Server("synthetic-data-kit-mcp")
+
+# Register tool definitions
+@server.list_tools()
+async def list_tools() -> List[Tool]:
+    """Return the five sdk_* tool definitions, each with a name, description and JSON input schema."""
+    return [
+        Tool(
+            name="sdk_ingest",
+            description="Parse documents (PDF, HTML, YouTube, DOCX, PPT, TXT) into clean text",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "input": {
+                        "type": "string",
+                        "description": "File, URL, or directory to parse"
+                    },
+                    "output_dir": {
+                        "type": "string",
+                        "description": "Where to save the output"
+                    },
+                    "name": {
+                        "type": "string",
+                        "description": "Custom output filename (only for single files)"
+                    }
+                },
+                "required": ["input"]
+            }
+        ),
+        Tool(
+            name="sdk_create",
+            description="Generate content from text using local LLM inference",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "input": {
+                        "type": "string",
+                        "description": "File or directory to process"
+                    },
+                    "content_type": {
+                        "type": "string",
+                        "enum": ["qa", "summary", "cot", "cot-enhance", "multimodal-qa"],
+                        "description": "Type of content to generate"
+                    },
+                    "output_dir": {
+                        "type": "string",
+                        "description": "Where to save the output"
+                    },
+                    "num_pairs": {
+                        "type": "integer",
+                        "description": "Target number of QA pairs or CoT examples to generate"
+                    },
+                    "model": {
+                        "type": "string",
+                        "description": "Model to use"
+                    }
+                },
+                "required": ["input"]
+            }
+        ),
+        Tool(
+            name="sdk_curate",
+            description="Clean and filter content based on quality",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "input": {
+                        "type": "string",
+                        "description": "Input file or directory to clean"
+                    },
+                    "output": {
+                        "type": "string",
+                        "description": "Output file path (for single files) or directory (for directories)"
+                    },
+                    "threshold": {
+                        "type": "number",
+                        "description": "Quality threshold (1-10)"
+                    },
+                    "model": {
+                        "type": "string",
+                        "description": "Model to use"
+                    }
+                },
+                "required": ["input"]
+            }
+        ),
+        Tool(
+            name="sdk_save_as",
+            description="Convert to different formats for fine-tuning",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "input": {
+                        "type": "string",
+                        "description": "Input file or directory to convert"
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["jsonl", "alpaca", "ft", "chatml"],
+                        "description": "Output format"
+                    },
+                    "storage": {
+                        "type": "string",
+                        "enum": ["json", "hf"],
+                        "description": "Storage format"
+                    },
+                    "output": {
+                        "type": "string",
+                        "description": "Output file path (for single files) or directory (for directories)"
+                    }
+                },
+                "required": ["input"]
+            }
+        ),
+        Tool(
+            name="sdk_system_check",
+            description="Check if the selected LLM provider's server is running",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "provider": {
+                        "type": "string",
+                        "enum": ["vllm", "api-endpoint"],
+                        "description": "Provider to check"
+                    },
+                    "api_base": {
+                        "type": "string",
+                        "description": "API base URL to check"
+                    }
+                }
+            }
+        )
+    ]
+
+@server.call_tool()
+async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
+    """Run the CLI command backing MCP tool `name` and return its captured output.
+
+    Only keys present in `arguments` are forwarded; failures are returned as text, not raised.
+    """
+    logger.info(f"Calling tool: {name} with arguments: {arguments}")
+    try:
+        # Map tool names to CLI commands
+        if name == "sdk_ingest":
+            cmd = ["synthetic-data-kit", "ingest"]
+            if "input" in arguments:
+                cmd.append(arguments["input"])
+            if "output_dir" in arguments:
+                cmd.extend(["--output-dir", arguments["output_dir"]])
+            if "name" in arguments:
+                cmd.extend(["--name", arguments["name"]])
+
+        elif name == "sdk_create":
+            cmd = ["synthetic-data-kit", "create"]
+            if "input" in arguments:
+                cmd.append(arguments["input"])
+            if "content_type" in arguments:
+                cmd.extend(["--type", arguments["content_type"]])
+            if "output_dir" in arguments:
+                cmd.extend(["--output-dir", arguments["output_dir"]])
+            if "num_pairs" in arguments:
+                cmd.extend(["--num-pairs", str(arguments["num_pairs"])])
+            if "model" in arguments:
+                cmd.extend(["--model", arguments["model"]])
+
+        elif name == "sdk_curate":
+            cmd = ["synthetic-data-kit", "curate"]
+            if "input" in arguments:
+                cmd.append(arguments["input"])
+            if "output" in arguments:
+                cmd.extend(["--output", arguments["output"]])
+            if "threshold" in arguments:
+                cmd.extend(["--threshold", str(arguments["threshold"])])
+            if "model" in arguments:
+                cmd.extend(["--model", arguments["model"]])
+
+        elif name == "sdk_save_as":
+            cmd = ["synthetic-data-kit", "save-as"]
+            if "input" in arguments:
+                cmd.append(arguments["input"])
+            if "format" in arguments:
+                cmd.extend(["--format", arguments["format"]])
+            if "storage" in arguments:
+                cmd.extend(["--storage", arguments["storage"]])
+            if "output" in arguments:
+                cmd.extend(["--output", arguments["output"]])
+
+        elif name == "sdk_system_check":
+            cmd = ["synthetic-data-kit", "system-check"]
+            if "provider" in arguments:
+                cmd.extend(["--provider", arguments["provider"]])
+            if "api_base" in arguments:
+                cmd.extend(["--api-base", arguments["api_base"]])
+        else:
+            return [TextContent(type="text", text=f"Unknown tool: {name}")]
+
+        # subprocess.run blocks, so it runs in a worker thread to keep the event loop serving
+        # MCP traffic; stdin is detached because the server's own stdin is the MCP channel.
+        logger.info(f"Executing command: {' '.join(cmd)}")
+        result = await asyncio.to_thread(
+            subprocess.run, cmd, capture_output=True, text=True, cwd=os.getcwd(), stdin=subprocess.DEVNULL
+        )
+        output = f"Command: {' '.join(cmd)}\nExit code: {result.returncode}\n"
+        if result.stdout:
+            output += f"Stdout:\n{result.stdout}\n"
+        if result.stderr:
+            output += f"Stderr:\n{result.stderr}\n"
+        return [TextContent(type="text", text=output)]
+    except Exception as e:
+        logger.exception(f"Error executing tool {name}: {str(e)}")
+        return [TextContent(type="text", text=f"Error executing tool {name}: {str(e)}")]
+
+@server.list_prompts()
+async def list_prompts() -> List[Prompt]:
+    """Return the prompt catalogue advertised to MCP clients.
+
+    There is a single prompt, `sdk-workflow`; only its `document_path` argument is required.
+    """
+    # (name, description, required) for each argument of the workflow prompt
+    argument_specs = (
+        ("document_path", "Path to the document to process", True),
+        ("content_type", "Type of content to generate (qa, summary, cot)", False),
+        ("num_pairs", "Number of QA pairs to generate", False),
+    )
+
+    # Turn each spec row into a PromptArgument, keeping the declared order
+    workflow_arguments = [
+        PromptArgument(name=arg_name, description=arg_description, required=is_required)
+        for arg_name, arg_description, is_required in argument_specs
+    ]
+
+    # The catalogue currently holds exactly one prompt
+    workflow_prompt = Prompt(
+        name="sdk-workflow",
+        description="A complete workflow for generating synthetic data",
+        arguments=workflow_arguments,
+    )
+    return [workflow_prompt]
+
+@server.get_prompt()
+async def get_prompt(name: str, arguments: Optional[Dict[str, Any]]) -> GetPromptResult:
+    """Build the prompt `name`; `arguments` may be None. Raises ValueError for an unknown name."""
+    if name == "sdk-workflow":
+        arguments = arguments or {}  # a request without prompt arguments arrives as None
+        document_path = arguments.get("document_path", "")
+        content_type = arguments.get("content_type", "qa")
+        num_pairs = arguments.get("num_pairs", 25)
+        
+        messages = [
+            PromptMessage(
+                role="user",
+                content=TextContent(
+                    type="text",
+                    text=f"Please help me process the document at '{document_path}' using the Synthetic Data Kit.\n"
+                         f"I want to generate {content_type} content with {num_pairs} pairs/examples.\n\n"
+                         f"Here's the workflow I'd like to follow:\n"
+                         f"1. Ingest the document to extract clean text\n"
+                         f"2. Create {content_type} content from the text\n"
+                         f"3. Curate the generated content for quality\n"
+                         f"4. Save the final result in an appropriate format\n\n"
+                         f"Can you generate the appropriate commands for this workflow?"
+                )
+            )
+        ]
+        return GetPromptResult(messages=messages)
+    
+    raise ValueError(f"Unknown prompt: {name}")
+
+async def _serve():
+    """Open the stdio transport and run the MCP server on it."""
+    from mcp.server import InitializationOptions
+    from mcp.types import ServerCapabilities, ToolsCapability, PromptsCapability
+
+    initialization_options = InitializationOptions(
+        server_name="synthetic-data-kit",
+        server_version="0.1.0",
+        capabilities=ServerCapabilities(tools=ToolsCapability(), prompts=PromptsCapability()),
+    )
+    async with stdio_server() as (read_stream, write_stream):
+        await server.run(read_stream, write_stream, initialization_options)
+
+def main():
+    """Synchronous entry point used by the `synthetic-data-kit-mcp` console script.
+
+    Console scripts call their target as a plain function, so an `async def main` would only
+    create a coroutine and exit without serving; the event loop is started here instead.
+    """
+    asyncio.run(_serve())
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/test_mcp_server.py b/test_mcp_server.py
new file mode 100644
index 00000000..72f5f7be
--- /dev/null
+++ b/test_mcp_server.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""
+Test script for the Synthetic Data Kit MCP server.
+This script demonstrates how to interact with the MCP server.
+"""
+
+import asyncio
+import json
+import sys
+import os
+from typing import Any, Dict, List
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+from mcp.types import TextContent
+
+
+async def test_mcp_server():
+    """Smoke-check the Synthetic Data Kit MCP server.
+
+    Spawns the server over stdio, initializes a session and prints the tool and prompt listings.
+    """
+    # Launch the server module with the current interpreter, from the current directory
+    launch_params = StdioServerParameters(
+        command=sys.executable,
+        args=["-m", "synthetic_data_kit.mcp_server"],
+        cwd=os.getcwd(),
+    )
+
+    async with stdio_client(launch_params) as (reader, writer):
+        async with ClientSession(reader, writer) as session:
+            # Initialize the session before issuing any request
+            await session.initialize()
+
+            # Test 1: tool listing
+            print("=== Testing tool listing ===")
+            tool_listing = await session.list_tools()
+            print(f"Available tools: {tool_listing}")
+
+            # Test 2: prompt listing
+            print("\n=== Testing prompt listing ===")
+            prompt_listing = await session.list_prompts()
+            print(f"Available prompts: {prompt_listing}")
+
+            print("\n=== MCP Server Tests Complete ===")
+
+
+if __name__ == "__main__":
+    asyncio.run(test_mcp_server())