From 25fa8f79e1040ed822545e4797db3730ace83717 Mon Sep 17 00:00:00 2001 From: groxaxo Date: Thu, 4 Sep 2025 03:28:13 +0000 Subject: [PATCH] Add MCP server implementation for Synthetic Data Kit\n\n- Created MCP server that exposes all Synthetic Data Kit functionality\n- Added test scripts and example usage\n- Updated pyproject.toml to include MCP server entry point\n- Added comprehensive documentation in MCP_README.md --- MCP_README.md | 107 +++++++++++ example_mcp_usage.py | 125 ++++++++++++ pyproject.toml | 1 + synthetic_data_kit/mcp_server.py | 317 +++++++++++++++++++++++++++++++ test_mcp_server.py | 49 +++++ 5 files changed, 599 insertions(+) create mode 100644 MCP_README.md create mode 100644 example_mcp_usage.py create mode 100644 synthetic_data_kit/mcp_server.py create mode 100644 test_mcp_server.py diff --git a/MCP_README.md b/MCP_README.md new file mode 100644 index 00000000..a9c46ec1 --- /dev/null +++ b/MCP_README.md @@ -0,0 +1,107 @@ +# Synthetic Data Kit MCP Server + +This directory contains an MCP (Model Context Protocol) server implementation for the Synthetic Data Kit. This allows MCP-compatible clients to interact with the Synthetic Data Kit through a standardized protocol. + +## Features + +The MCP server provides access to all major functionalities of the Synthetic Data Kit: + +1. **Document Ingestion** - Parse documents (PDF, HTML, YouTube, DOCX, PPT, TXT) into clean text +2. **Content Creation** - Generate QA pairs, summaries, and Chain of Thought examples +3. **Content Curation** - Clean and filter content based on quality +4. **Format Conversion** - Convert to different formats for fine-tuning +5. **System Check** - Verify LLM provider connectivity + +## Installation + +The MCP server is automatically installed when you install the Synthetic Data Kit in development mode: + +```bash +cd /path/to/synthetic-data-kit +pip install -e . 
+``` + +## Usage + +### Starting the MCP Server + +To start the MCP server directly: + +```bash +synthetic-data-kit-mcp +``` + +This will start the server and listen for MCP connections over stdio. + +### Using with an MCP Client + +The server can be used with any MCP-compatible client. For example, if you're using Claude Desktop or another MCP client, you can configure it to connect to this server. + +### Tools Available + +The server exposes the following tools: + +1. `sdk_ingest` - Parse documents into clean text +2. `sdk_create` - Generate content from text +3. `sdk_curate` - Clean and filter content based on quality +4. `sdk_save_as` - Convert to different formats for fine-tuning +5. `sdk_system_check` - Check if the selected LLM provider's server is running + +Each tool maps directly to the corresponding CLI command in the Synthetic Data Kit. + +### Prompts Available + +The server also provides prompts: + +1. `sdk-workflow` - A complete workflow for generating synthetic data + +## Example Usage + +Here's an example of how an MCP client might interact with the server: + +1. Client requests tool list → Server responds with available tools +2. Client calls `sdk_ingest` with a document path → Server runs `synthetic-data-kit ingest` +3. Client calls `sdk_create` with parameters → Server runs `synthetic-data-kit create` +4. Client calls `sdk_curate` to filter results → Server runs `synthetic-data-kit curate` +5. 
Client calls `sdk_save_as` to convert formats → Server runs `synthetic-data-kit save-as` + +## Development + +To test the MCP server: + +```bash +cd /path/to/synthetic-data-kit +python test_mcp_server.py +``` + +To run a complete example: + +```bash +cd /path/to/synthetic-data-kit +python example_mcp_usage.py +``` + +## Architecture + +The MCP server acts as a bridge between MCP clients and the Synthetic Data Kit CLI: + +``` +MCP Client ↔ MCP Server ↔ Synthetic Data Kit CLI +``` + +All commands are executed as subprocess calls to the CLI, ensuring full compatibility with existing functionality. + +## Configuration + +The MCP server uses the same configuration as the Synthetic Data Kit CLI. Make sure your `config.yaml` is properly set up before using the server. + +## Troubleshooting + +If you encounter issues: + +1. Ensure the Synthetic Data Kit is properly installed: `pip install -e .` +2. Verify the CLI works: `synthetic-data-kit --help` +3. Check that required dependencies are installed +4. Ensure your LLM provider (vLLM or API endpoint) is properly configured and running + +Note: For API endpoints, you'll need to set the appropriate API keys in your environment or configuration file. \ No newline at end of file diff --git a/example_mcp_usage.py b/example_mcp_usage.py new file mode 100644 index 00000000..34f37b27 --- /dev/null +++ b/example_mcp_usage.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating how to use the Synthetic Data Kit MCP server. +This script shows how to process a document using the MCP server. 
import asyncio
import json
import sys
import os
from typing import Any, Dict, List

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from mcp.types import TextContent


async def process_document_example():
    """Run the ingest -> create -> curate -> save-as pipeline through the MCP server.

    The server is launched as a child process (``python -m
    synthetic_data_kit.mcp_server``) in the current working directory and
    driven over stdio. A small sample document is written to the working
    directory, pushed through the four tools in order, and removed again when
    the run ends -- including when a step is cancelled or fails unexpectedly.

    Each step is best-effort: a failing tool call is reported on stdout and the
    next step is still attempted, so the script always shows every stage.
    """
    sample_path = "sample_document.txt"

    # Launch the server with the same interpreter and working directory as this
    # script so relative paths mean the same thing on both sides.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "synthetic_data_kit.mcp_server"],
        cwd=os.getcwd()
    )

    # (banner, tool name, tool arguments, success label, failure label)
    steps = [
        (
            "\n1. Ingesting document...",
            "sdk_ingest",
            {"input": sample_path, "output_dir": "data/parsed"},
            "Ingest result",
            "Error during ingestion",
        ),
        (
            "\n2. Creating QA pairs...",
            "sdk_create",
            {
                "input": "data/parsed/sample_document.lance",
                "content_type": "qa",
                "num_pairs": 5,
            },
            "Create result",
            "Error during creation",
        ),
        (
            "\n3. Curating content...",
            "sdk_curate",
            {
                "input": "data/generated/sample_document_qa_pairs.json",
                "threshold": 7.0,
            },
            "Curate result",
            "Error during curation",
        ),
        (
            "\n4. Saving in final format...",
            "sdk_save_as",
            {
                "input": "data/curated/sample_document_cleaned.json",
                "format": "alpaca",
            },
            "Save-as result",
            "Error during format conversion",
        ),
    ]

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            # The MCP handshake must complete before any tool call.
            await session.initialize()

            sample_text = """
            Artificial Intelligence (AI) is a branch of computer science that aims to create software or machines that exhibit human-like intelligence.
            This can include learning from experience, understanding natural language, solving problems, and recognizing patterns.

            Machine Learning (ML) is a subset of AI that focuses on algorithms and statistical models that enable computers to improve at tasks
            with experience. Deep Learning is a further subset of ML that uses neural networks with multiple layers.

            Natural Language Processing (NLP) is another important area of AI that deals with the interaction between computers and humans
            using natural language. It involves tasks like language translation, sentiment analysis, and text summarization.

            Computer Vision is yet another field that enables computers to interpret and understand visual information from the world,
            including image and video recognition.
            """

            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(sample_path, "w", encoding="utf-8") as f:
                f.write(sample_text)

            try:
                print("=== Processing Sample Document ===")
                print("Sample document created: sample_document.txt")

                for banner, tool, tool_args, ok_label, err_label in steps:
                    print(banner)
                    try:
                        result = await session.call_tool(tool, tool_args)
                        print(f"{ok_label}: {result}")
                    except Exception as e:
                        # Best-effort demo: report and carry on with the next step.
                        print(f"{err_label}: {e}")

                print("\n=== Document Processing Complete ===")
            finally:
                # Runs even on cancellation / unexpected errors so the sample
                # file never lingers in the working directory.
                try:
                    os.remove(sample_path)
                    print("Cleaned up sample_document.txt")
                except OSError as e:
                    print(f"Could not remove sample_document.txt: {e}")


if __name__ == "__main__":
    asyncio.run(process_document_example())
import asyncio
import json
import logging
import os
import shlex
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.types import (
    GetPromptResult, Prompt, PromptArgument, PromptMessage, TextContent,
    Tool, ListRootsResult, Root
)

# Set up logging. basicConfig's default handler writes to stderr, which keeps
# log lines out of stdout -- stdout carries the MCP protocol stream.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the MCP server
server = Server("synthetic-data-kit-mcp")

# Executable every tool shells out to.
_CLI = "synthetic-data-kit"

# tool name -> (CLI subcommand, required positional argument key or None,
#               ((argument key, CLI flag), ...) in the order they are emitted)
_TOOL_COMMANDS: Dict[str, Tuple[str, Optional[str], Tuple[Tuple[str, str], ...]]] = {
    "sdk_ingest": (
        "ingest",
        "input",
        (("output_dir", "--output-dir"), ("name", "--name")),
    ),
    "sdk_create": (
        "create",
        "input",
        (
            ("content_type", "--type"),
            ("output_dir", "--output-dir"),
            ("num_pairs", "--num-pairs"),
            ("model", "--model"),
        ),
    ),
    "sdk_curate": (
        "curate",
        "input",
        (("output", "--output"), ("threshold", "--threshold"), ("model", "--model")),
    ),
    "sdk_save_as": (
        "save-as",
        "input",
        (("format", "--format"), ("storage", "--storage"), ("output", "--output")),
    ),
    "sdk_system_check": (
        "system-check",
        None,
        (("provider", "--provider"), ("api_base", "--api-base")),
    ),
}


# Register tool definitions
@server.list_tools()
async def list_tools() -> List[Tool]:
    """Return the tool definitions (name, description, JSON input schema) this server exposes."""
    return [
        Tool(
            name="sdk_ingest",
            description="Parse documents (PDF, HTML, YouTube, DOCX, PPT, TXT) into clean text",
            inputSchema={
                "type": "object",
                "properties": {
                    "input": {
                        "type": "string",
                        "description": "File, URL, or directory to parse"
                    },
                    "output_dir": {
                        "type": "string",
                        "description": "Where to save the output"
                    },
                    "name": {
                        "type": "string",
                        "description": "Custom output filename (only for single files)"
                    }
                },
                "required": ["input"]
            }
        ),
        Tool(
            name="sdk_create",
            description="Generate content from text using local LLM inference",
            inputSchema={
                "type": "object",
                "properties": {
                    "input": {
                        "type": "string",
                        "description": "File or directory to process"
                    },
                    "content_type": {
                        "type": "string",
                        "enum": ["qa", "summary", "cot", "cot-enhance", "multimodal-qa"],
                        "description": "Type of content to generate"
                    },
                    "output_dir": {
                        "type": "string",
                        "description": "Where to save the output"
                    },
                    "num_pairs": {
                        "type": "integer",
                        "description": "Target number of QA pairs or CoT examples to generate"
                    },
                    "model": {
                        "type": "string",
                        "description": "Model to use"
                    }
                },
                "required": ["input"]
            }
        ),
        Tool(
            name="sdk_curate",
            description="Clean and filter content based on quality",
            inputSchema={
                "type": "object",
                "properties": {
                    "input": {
                        "type": "string",
                        "description": "Input file or directory to clean"
                    },
                    "output": {
                        "type": "string",
                        "description": "Output file path (for single files) or directory (for directories)"
                    },
                    "threshold": {
                        "type": "number",
                        "description": "Quality threshold (1-10)"
                    },
                    "model": {
                        "type": "string",
                        "description": "Model to use"
                    }
                },
                "required": ["input"]
            }
        ),
        Tool(
            name="sdk_save_as",
            description="Convert to different formats for fine-tuning",
            inputSchema={
                "type": "object",
                "properties": {
                    "input": {
                        "type": "string",
                        "description": "Input file or directory to convert"
                    },
                    "format": {
                        "type": "string",
                        "enum": ["jsonl", "alpaca", "ft", "chatml"],
                        "description": "Output format"
                    },
                    "storage": {
                        "type": "string",
                        "enum": ["json", "hf"],
                        "description": "Storage format"
                    },
                    "output": {
                        "type": "string",
                        "description": "Output file path (for single files) or directory (for directories)"
                    }
                },
                "required": ["input"]
            }
        ),
        Tool(
            name="sdk_system_check",
            description="Check if the selected LLM provider's server is running",
            inputSchema={
                "type": "object",
                "properties": {
                    "provider": {
                        "type": "string",
                        "enum": ["vllm", "api-endpoint"],
                        "description": "Provider to check"
                    },
                    "api_base": {
                        "type": "string",
                        "description": "API base URL to check"
                    }
                }
            }
        )
    ]


def _build_command(name: str, arguments: Dict[str, Any]) -> List[str]:
    """Translate a tool call into the CLI argument vector.

    Args:
        name: A key of ``_TOOL_COMMANDS``.
        arguments: Tool arguments as sent by the client. Keys that are absent
            or ``None`` are skipped; every value is stringified because
            ``subprocess`` only accepts string arguments.

    Returns:
        ``[cli, subcommand, positional?, --flag, value, ...]``.

    Raises:
        KeyError: ``name`` is not a known tool.
        ValueError: The tool's required positional argument is missing.
    """
    subcommand, positional_key, options = _TOOL_COMMANDS[name]
    cmd = [_CLI, subcommand]

    if positional_key is not None:
        positional = arguments.get(positional_key)
        if positional is None:
            # Fail here with a clear message instead of letting the CLI
            # complain about a missing argument.
            raise ValueError(f"Missing required argument: {positional_key}")
        cmd.append(str(positional))

    for key, flag in options:
        value = arguments.get(key)
        if value is not None:
            cmd.extend([flag, str(value)])

    return cmd


def _format_result(cmd: List[str], result: "subprocess.CompletedProcess[str]") -> str:
    """Render the command line, exit code and any captured output as one text block."""
    parts = [
        f"Command: {shlex.join(cmd)}\n",
        f"Exit code: {result.returncode}\n",
    ]
    if result.stdout:
        parts.append(f"Stdout:\n{result.stdout}\n")
    if result.stderr:
        parts.append(f"Stderr:\n{result.stderr}\n")
    return "".join(parts)


@server.call_tool()
async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
    """Run the CLI command behind a tool and return its output as text.

    Args:
        name: Tool name as advertised by ``list_tools``.
        arguments: Tool arguments; ``None`` is treated as no arguments.

    Returns:
        A single ``TextContent`` holding either the command line, exit code
        and captured stdout/stderr, or an error message. Failures (unknown
        tool, missing required argument, CLI not launchable) are reported as
        text rather than raised.
    """
    logger.info("Calling tool: %s with arguments: %s", name, arguments)

    if name not in _TOOL_COMMANDS:
        return [TextContent(type="text", text=f"Unknown tool: {name}")]

    try:
        cmd = _build_command(name, arguments or {})
        logger.info("Executing command: %s", shlex.join(cmd))

        # The CLI can run for a long time; running it in a worker thread keeps
        # the event loop free to service the protocol in the meantime.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            cwd=os.getcwd(),
            # Our stdin is the MCP message stream; the child must never read it.
            stdin=subprocess.DEVNULL,
        )
    except Exception as e:
        logger.exception("Error executing tool %s", name)
        return [TextContent(type="text", text=f"Error executing tool {name}: {str(e)}")]

    return [TextContent(type="text", text=_format_result(cmd, result))]


@server.list_prompts()
async def list_prompts() -> List[Prompt]:
    """Return the prompt templates this server exposes."""
    return [
        Prompt(
            name="sdk-workflow",
            description="A complete workflow for generating synthetic data",
            arguments=[
                PromptArgument(
                    name="document_path",
                    description="Path to the document to process",
                    required=True
                ),
                PromptArgument(
                    name="content_type",
                    description="Type of content to generate (qa, summary, cot)",
                    required=False
                ),
                PromptArgument(
                    name="num_pairs",
                    description="Number of QA pairs to generate",
                    required=False
                )
            ]
        )
    ]


@server.get_prompt()
async def get_prompt(name: str, arguments: Dict[str, Any]) -> GetPromptResult:
    """Build the named prompt.

    Args:
        name: Prompt name; only ``"sdk-workflow"`` is defined.
        arguments: Prompt arguments; ``None`` is treated as empty.
            ``document_path`` is required, ``content_type`` defaults to
            ``"qa"`` and ``num_pairs`` to ``25``.

    Returns:
        A ``GetPromptResult`` with a single user message describing the
        ingest / create / curate / save workflow.

    Raises:
        ValueError: Unknown prompt name, or ``document_path`` missing/empty.
    """
    if name != "sdk-workflow":
        raise ValueError(f"Unknown prompt: {name}")

    # Guard against a client that sends no arguments object at all.
    arguments = arguments or {}

    document_path = arguments.get("document_path", "")
    if not document_path:
        # Declared required in list_prompts; an empty path yields a useless prompt.
        raise ValueError("Missing required argument: document_path")
    content_type = arguments.get("content_type", "qa")
    num_pairs = arguments.get("num_pairs", 25)

    # Create a prompt that describes the workflow
    messages = [
        PromptMessage(
            role="user",
            content=TextContent(
                type="text",
                text=f"Please help me process the document at '{document_path}' using the Synthetic Data Kit.\n"
                     f"I want to generate {content_type} content with {num_pairs} pairs/examples.\n\n"
                     f"Here's the workflow I'd like to follow:\n"
                     f"1. Ingest the document to extract clean text\n"
                     f"2. Create {content_type} content from the text\n"
                     f"3. Curate the generated content for quality\n"
                     f"4. Save the final result in an appropriate format\n\n"
                     f"Can you generate the appropriate commands for this workflow?"
            )
        )
    ]
    return GetPromptResult(messages=messages)


async def _serve() -> None:
    """Serve MCP requests over stdio until the client disconnects."""
    async with stdio_server() as (read_stream, write_stream):
        # Create initialization options
        from mcp.server import InitializationOptions
        from mcp.types import ServerCapabilities, ToolsCapability, PromptsCapability

        capabilities = ServerCapabilities(
            tools=ToolsCapability(),
            prompts=PromptsCapability()
        )

        initialization_options = InitializationOptions(
            server_name="synthetic-data-kit",
            server_version="0.1.0",
            capabilities=capabilities
        )

        await server.run(read_stream, write_stream, initialization_options)


def main() -> None:
    """Synchronous entry point for the MCP server.

    The ``synthetic-data-kit-mcp`` console script calls this function directly
    and does not await its result, so it must be a plain function that drives
    the event loop itself; a coroutine function here would return an
    un-awaited coroutine and the server would never start.
    """
    asyncio.run(_serve())


if __name__ == "__main__":
    main()
import asyncio
import json
import sys
import os
from typing import Any, Dict, List

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from mcp.types import TextContent


async def test_mcp_server():
    """Smoke-test the MCP server: start it, list tools and prompts, check the expected ones exist.

    The server is launched as ``python -m synthetic_data_kit.mcp_server`` in
    the current working directory and queried over stdio.

    Raises:
        AssertionError: An expected tool or prompt is not advertised, so the
            script exits non-zero instead of only printing what it received.
    """
    expected_tools = {
        "sdk_ingest",
        "sdk_create",
        "sdk_curate",
        "sdk_save_as",
        "sdk_system_check",
    }
    expected_prompts = {"sdk-workflow"}

    # Start the MCP server as a subprocess
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "synthetic_data_kit.mcp_server"],
        cwd=os.getcwd()
    )

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            # The MCP handshake must complete before any request.
            await session.initialize()

            # Test 1: List tools
            print("=== Testing tool listing ===")
            tools = await session.list_tools()
            print(f"Available tools: {tools}")
            missing_tools = expected_tools - {tool.name for tool in tools.tools}
            assert not missing_tools, f"Missing tools: {sorted(missing_tools)}"

            # Test 2: List prompts
            print("\n=== Testing prompt listing ===")
            prompts = await session.list_prompts()
            print(f"Available prompts: {prompts}")
            missing_prompts = expected_prompts - {prompt.name for prompt in prompts.prompts}
            assert not missing_prompts, f"Missing prompts: {sorted(missing_prompts)}"

            print("\n=== MCP Server Tests Complete ===")


if __name__ == "__main__":
    asyncio.run(test_mcp_server())