From 9950f6f4cf44dac205be3ed4db8690a043e400ba Mon Sep 17 00:00:00 2001 From: dil Date: Mon, 28 Oct 2024 05:59:06 +0000 Subject: [PATCH 01/28] Initial version of MinIO event based dataprep for Milvus --- comps/dataprep/minio/__init__.py | 0 .../minio/milvus/langchain/Dockerfile | 38 + .../dataprep/minio/milvus/langchain/README.md | 252 ++++ .../minio/milvus/langchain/__init__.py | 2 + .../dataprep/minio/milvus/langchain/config.py | 25 + .../minio/milvus/langchain/docker-compose.yml | 69 ++ .../minio/milvus/langchain/milvus.yaml | 1031 +++++++++++++++++ .../milvus/langchain/prepare_doc_milvus.py | 491 ++++++++ .../minio/milvus/langchain/requirements.txt | 31 + comps/dataprep/minio/minio_schema.py | 76 ++ 10 files changed, 2015 insertions(+) create mode 100644 comps/dataprep/minio/__init__.py create mode 100644 comps/dataprep/minio/milvus/langchain/Dockerfile create mode 100644 comps/dataprep/minio/milvus/langchain/README.md create mode 100644 comps/dataprep/minio/milvus/langchain/__init__.py create mode 100644 comps/dataprep/minio/milvus/langchain/config.py create mode 100644 comps/dataprep/minio/milvus/langchain/docker-compose.yml create mode 100644 comps/dataprep/minio/milvus/langchain/milvus.yaml create mode 100644 comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py create mode 100644 comps/dataprep/minio/milvus/langchain/requirements.txt create mode 100644 comps/dataprep/minio/minio_schema.py diff --git a/comps/dataprep/minio/__init__.py b/comps/dataprep/minio/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/comps/dataprep/minio/milvus/langchain/Dockerfile b/comps/dataprep/minio/milvus/langchain/Dockerfile new file mode 100644 index 0000000000..8c68b10edf --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" +# Install system packages; the apt lists are removed in the same layer so they never reach the image. +RUN apt-get update -y && apt-get install -y
--no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + tesseract-ocr && rm -rf /var/lib/apt/lists/* + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +# ARCH=cpu (the default) installs the CPU-only PyTorch wheels before requirements.txt is installed. +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/minio/milvus/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/minio/milvus/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/minio/milvus/langchain/uploaded_files + +USER user +WORKDIR /home/user/comps/dataprep/minio/milvus/langchain + +ENTRYPOINT ["python", "prepare_doc_milvus.py"] diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md new file mode 100644 index 0000000000..f349df54cb --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -0,0 +1,252 @@ +# Dataprep Microservice with Milvus + +## 🚀1. Start Microservice with Python (Option 1) + +### 1.1 Requirements + +```bash +pip install -r requirements.txt +apt-get install tesseract-ocr -y +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### 1.2 Start Milvus Server + +Please refer to this [readme](../../../vectorstores/milvus/README.md). + +### 1.3 Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export MILVUS_HOST=${your_milvus_host_ip} +export MILVUS_PORT=19530 +export COLLECTION_NAME=${your_collection_name} +export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} +``` + +### 1.4 Start Mosec Embedding Service + +First, you need to build a mosec embedding serving docker image. + +```bash +cd ../../..
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . +``` + +Then start the mosec embedding server. + +```bash +your_port=6010 +docker run -d --name="embedding-mosec-endpoint" -p $your_port:8000 opea/embedding-mosec-endpoint:latest +``` + +Setup environment variables: + +```bash +export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" +export MILVUS_HOST=${your_host_ip} +``` + +### 1.5 Start Document Preparation Microservice for Milvus with Python Script + +Start the document preparation microservice for Milvus with the command below. + +```bash +python prepare_doc_milvus.py +``` + +## 🚀2. Start Microservice with Docker (Option 2) + +### 2.1 Start Milvus Server + +Please refer to this [readme](../../../vectorstores/milvus/README.md). + +### 2.2 Build Docker Image + +```bash +cd ../../.. +# build mosec embedding docker image +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . +# build dataprep milvus docker image +docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/milvus/langchain/Dockerfile .
+``` + +### 2.3 Setup Environment Variables + +```bash +export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" +export MILVUS_HOST=${your_host_ip} +``` + +### 2.4 Run Docker with CLI (Option A) + +```bash +docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} opea/dataprep-milvus:latest +``` + +### 2.5 Run with Docker Compose (Option B) + +```bash +mkdir model +cd model +git clone https://huggingface.co/BAAI/bge-base-en-v1.5 +cd ../ +# Update `host_ip` and `HUGGINGFACEHUB_API_TOKEN` in set_env.sh +. set_env.sh +docker compose -f docker-compose-dataprep-milvus.yaml up -d +``` + +## 🚀3. Consume Microservice + +### 3.1 Consume Upload API + +Once the document preparation microservice for Milvus is started, use the command below to invoke it; the microservice converts the document to embeddings and saves them to the database. + +Make sure the file path after `files=@` is correct. + +- Single file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + http://localhost:6010/v1/dataprep +``` + +You can specify chunk_size and chunk_overlap with the following command. To avoid big chunks, pass a small chunk_size such as 500, as shown below (the default is 1500).
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + -F "chunk_size=500" \ + -F "chunk_overlap=100" \ + http://localhost:6010/v1/dataprep +``` + +- Multiple file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.pdf" \ + -F "files=@./file2.pdf" \ + -F "files=@./file3.pdf" \ + http://localhost:6010/v1/dataprep +``` + +- Links upload (not supported for llama_index now) + +```bash +curl -X POST \ + -F 'link_list=["https://www.ces.tech/"]' \ + http://localhost:6010/v1/dataprep +``` + +or + +```python +import requests +import json + +proxies = {"http": ""} +url = "http://localhost:6010/v1/dataprep" +urls = [ + "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" +] +payload = {"link_list": json.dumps(urls)} + +try: + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") +except requests.exceptions.RequestException as e: + print("An error occurred:", e) +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. + +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep +``` + +### 3.2 Consume get_file API + +To get uploaded file structures, use the following command: + +```bash +curl -X POST \ + -H "Content-Type: application/json" \ + http://localhost:6010/v1/dataprep/get_file +``` + +The response is JSON like this: + +```json +[ + { + "name": "uploaded_file_1.txt", + "id": "uploaded_file_1.txt", + "type": "File", + "parent": "" + }, + { + "name": "uploaded_file_2.txt", + "id": "uploaded_file_2.txt", + "type": "File", + "parent": "" + } +] +``` + +### 3.3 Consume delete_file API + +To delete an uploaded file or link, use the following command. + +The `file_path` here should be the `id` returned by the `/v1/dataprep/get_file` API.
+ +```bash +# delete link +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "https://www.ces.tech/.txt"}' \ + http://localhost:6010/v1/dataprep/delete_file + +# delete file +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "uploaded_file_1.txt"}' \ + http://localhost:6010/v1/dataprep/delete_file + +# delete all files and links, will drop the entire db collection +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ + http://localhost:6010/v1/dataprep/delete_file +``` + +## 🚀4. Troubleshooting + +1. If you get errors from Mosec Embedding Endpoint like `cannot find this task, maybe it has expired` while uploading files, try to reduce the `chunk_size` in the curl command like below (the default chunk_size=1500). + + ```bash + curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + -F "chunk_size=500" \ + http://localhost:6010/v1/dataprep + ``` diff --git a/comps/dataprep/minio/milvus/langchain/__init__.py b/comps/dataprep/minio/milvus/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/milvus/langchain/config.py b/comps/dataprep/minio/milvus/langchain/config.py new file mode 100644 index 0000000000..ef2a2366ea --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/config.py @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Local Embedding model +LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1") +# TEI Embedding endpoints +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +# MILVUS configuration +MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost") +MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530)) +COLLECTION_NAME
= os.getenv("COLLECTION_NAME", "rag_milvus") +# MOSEC configuration +MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5") +MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "") +# MinIO configuration +MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")  # NOTE(review): the bundled docker-compose.yml publishes its MinIO API on host port 5043, not 9000 - confirm which instance this default targets +MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == "true"  # True only when the variable is "true" (case-insensitive) +MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") +MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") +os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT  # set at import time; presumably read by an OpenAI-compatible embedding client - TODO confirm +os.environ["OPENAI_API_KEY"] = "Dummy key" +diff --git a/comps/dataprep/minio/milvus/langchain/docker-compose.yml b/comps/dataprep/minio/milvus/langchain/docker-compose.yml new file mode 100644 index 0000000000..8dde3fed06 --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/docker-compose.yml @@ -0,0 +1,69 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.5 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "5044:9001" + - "5043:9000" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server
/minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.4.9 + command: ["milvus", "run", "standalone"] + security_opt: + - seccomp:unconfined + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + - ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + start_period: 90s + timeout: 20s + retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus diff --git a/comps/dataprep/minio/milvus/langchain/milvus.yaml b/comps/dataprep/minio/milvus/langchain/milvus.yaml new file mode 100644 index 0000000000..52962b8342 --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/milvus.yaml @@ -0,0 +1,1031 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Related configuration of etcd, used to store Milvus metadata & service discovery. +etcd: + # Endpoints used to access etcd service. You can change this parameter as the endpoints of your own etcd cluster. + # Environment variable: ETCD_ENDPOINTS + # etcd preferentially acquires valid address from environment variable ETCD_ENDPOINTS when Milvus is started. + endpoints: localhost:2379 + # Root prefix of the key to where Milvus stores data in etcd. + # It is recommended to change this parameter before starting Milvus for the first time. + # To share an etcd instance among multiple Milvus instances, consider changing this to a different value for each Milvus instance before you start them. + # Set an easy-to-identify root path for Milvus if etcd service already exists. + # Changing this for an already running Milvus instance may result in failures to read legacy data. + rootPath: by-dev + # Sub-prefix of the key to where Milvus stores metadata-related information in etcd. + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + metaSubPath: meta + # Sub-prefix of the key to where Milvus stores timestamps in etcd. + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended not to change this parameter if there is no specific reason. + kvSubPath: kv + log: + level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'. + # path is one of: + # - "default" as os.Stderr, + # - "stderr" as os.Stderr, + # - "stdout" as os.Stdout, + # - file path to append server logs to. 
+ # please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log + path: stdout + ssl: + enabled: false # Whether to support ETCD secure connection mode + tlsCert: /path/to/etcd-client.pem # path to your cert file + tlsKey: /path/to/etcd-client-key.pem # path to your key file + tlsCACert: /path/to/ca.pem # path to your CACert file + # TLS min version + # Optional values: 1.0, 1.1, 1.2, 1.3。 + # We recommend using version 1.2 and above. + tlsMinVersion: 1.3 + requestTimeout: 10000 # Etcd operation timeout in milliseconds + use: + embed: false # Whether to enable embedded Etcd (an in-process EtcdServer). + data: + dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/ + auth: + enabled: false # Whether to enable authentication + userName: # username for etcd authentication + password: # password for etcd authentication + +metastore: + type: etcd # Default value: etcd, Valid values: [etcd, tikv] + +# Related configuration of tikv, used to store Milvus metadata. +# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery. +# TiKV is a good option when the metadata size requires better horizontal scalability. +tikv: + endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd. + rootPath: by-dev # The root path where data is stored in tikv + metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath + kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath + requestTimeout: 10000 # ms, tikv request timeout + snapshotScanSize: 256 # batch size of tikv snapshot scan + ssl: + enabled: false # Whether to support TiKV secure connection mode + tlsCert: # path to your cert file + tlsKey: # path to your key file + tlsCACert: # path to your CACert file + +localStorage: + # Local path to where vector data are stored during a search or a query to avoid repetitve access to MinIO or S3 service. 
+ # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + path: /var/lib/milvus/data/ + +# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus. +# We refer to the storage service as MinIO/S3 in the following description for simplicity. +minio: + # IP address of MinIO or S3 service. + # Environment variable: MINIO_ADDRESS + # minio.address and minio.port together generate the valid access to MinIO or S3 service. + # MinIO preferentially acquires the valid IP address from the environment variable MINIO_ADDRESS when Milvus is started. + # Default value applies when MinIO or S3 is running on the same network with Milvus. + address: localhost + port: 9000 # Port of MinIO or S3 service. + # Access key ID that MinIO or S3 issues to user for authorized access. + # Environment variable: MINIO_ACCESS_KEY_ID or minio.accessKeyID + # minio.accessKeyID and minio.secretAccessKey together are used for identity authentication to access the MinIO or S3 service. + # This configuration must be set identical to the environment variable MINIO_ACCESS_KEY_ID, which is necessary for starting MinIO or S3. + # The default value applies to MinIO or S3 service that started with the default docker-compose.yml file. + accessKeyID: minioadmin + # Secret key used to encrypt the signature string and verify the signature string on server. It must be kept strictly confidential and accessible only to the MinIO or S3 server and users. + # Environment variable: MINIO_SECRET_ACCESS_KEY or minio.secretAccessKey + # minio.accessKeyID and minio.secretAccessKey together are used for identity authentication to access the MinIO or S3 service. + # This configuration must be set identical to the environment variable MINIO_SECRET_ACCESS_KEY, which is necessary for starting MinIO or S3. 
+ # The default value applies to MinIO or S3 service that started with the default docker-compose.yml file. + secretAccessKey: minioadmin + useSSL: false # Switch value to control if to access the MinIO or S3 service through SSL. + ssl: + tlsCACert: /path/to/public.crt # path to your CACert file + # Name of the bucket where Milvus stores data in MinIO or S3. + # Milvus 2.0.0 does not support storing data in multiple buckets. + # Bucket with this name will be created if it does not exist. If the bucket already exists and is accessible, it will be used directly. Otherwise, there will be an error. + # To share an MinIO instance among multiple Milvus instances, consider changing this to a different value for each Milvus instance before you start them. For details, see Operation FAQs. + # The data will be stored in the local Docker if Docker is used to start the MinIO service locally. Ensure that there is sufficient storage space. + # A bucket name is globally unique in one MinIO or S3 instance. + bucketName: a-bucket + # Root prefix of the key to where Milvus stores data in MinIO or S3. + # It is recommended to change this parameter before starting Milvus for the first time. + # To share an MinIO instance among multiple Milvus instances, consider changing this to a different value for each Milvus instance before you start them. For details, see Operation FAQs. + # Set an easy-to-identify root key prefix for Milvus if etcd service already exists. + # Changing this for an already running Milvus instance may result in failures to read legacy data. 
+ rootPath: files + # Whether to useIAM role to access S3/GCS instead of access/secret keys + # For more information, refer to + # aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html + # gcp: https://cloud.google.com/storage/docs/access-control/iam + # aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control + # aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role + useIAM: false + # Cloud Provider of S3. Supports: "aws", "gcp", "aliyun". + # You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio + # You can use "gcp" for other cloud provider supports S3 API with signature v2 + # You can use "aliyun" for other cloud provider uses virtual host style bucket + # When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now + cloudProvider: aws + # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws". + # Leave it empty if you want to use AWS default endpoint + iamEndpoint: + logLevel: fatal # Log level for aws sdk log. Supported level: off, fatal, error, warn, info, debug, trace + region: # Specify minio storage system location region + useVirtualHost: false # Whether use virtual host mode for bucket + requestTimeoutMs: 10000 # minio timeout for request time in milliseconds + # The maximum number of objects requested per batch in minio ListObjects rpc, + # 0 means using oss client by default, decrease these configuration if ListObjects timeout + listObjectsMaxKeys: 0 + +# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka. +# You can change your mq by setting mq.type field. +# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file. +# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka +# 2. 
cluster mode: Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode) +mq: + # Default value: "default" + # Valid values: [default, pulsar, kafka, rocksmq, natsmq] + type: default + enablePursuitMode: true # Default value: "true" + pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds + pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes + mqBufSize: 16 # MQ client consumer buffer length + dispatcher: + mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge + targetBufSize: 16 # the length of channel buffer for targe + maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack + +# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services. +pulsar: + # IP address of Pulsar service. + # Environment variable: PULSAR_ADDRESS + # pulsar.address and pulsar.port together generate the valid access to Pulsar. + # Pulsar preferentially acquires the valid IP address from the environment variable PULSAR_ADDRESS when Milvus is started. + # Default value applies when Pulsar is running on the same network with Milvus. + address: localhost + port: 6650 # Port of Pulsar service. + webport: 80 # Web port of of Pulsar service. If you connect directly without proxy, should use 8080. + # The maximum size of each message in Pulsar. Unit: Byte. + # By default, Pulsar can transmit at most 5 MB of data in a single message. When the size of inserted data is greater than this value, proxy fragments the data into multiple messages to ensure that they can be transmitted correctly. + # If the corresponding parameter in Pulsar remains unchanged, increasing this configuration will cause Milvus to fail, and reducing it produces no advantage. + maxMessageSize: 5242880 + # Pulsar can be provisioned for specific tenants with appropriate capacity allocated to the tenant. 
+ # To share a Pulsar instance among multiple Milvus instances, you can change this to an Pulsar tenant rather than the default one for each Milvus instance before you start them. However, if you do not want Pulsar multi-tenancy, you are advised to change msgChannel.chanNamePrefix.cluster to the different value. + tenant: public + namespace: default # A Pulsar namespace is the administrative unit nomenclature within a tenant. + requestTimeout: 60 # pulsar client global request timeout in seconds + enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path. + +# If you want to enable kafka, needs to comment the pulsar configs +# kafka: +# brokerList: +# saslUsername: +# saslPassword: +# saslMechanisms: +# securityProtocol: +# ssl: +# enabled: false # whether to enable ssl mode +# tlsCert: # path to client's public key (PEM) used for authentication +# tlsKey: # path to client's private key (PEM) used for authentication +# tlsCaCert: # file or directory path to CA certificate(s) for verifying the broker's key +# tlsKeyPassword: # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any +# readTimeout: 10 + +rocksmq: + # Prefix of the key to where Milvus stores data in RocksMQ. + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + # Set an easy-to-identify root key prefix for Milvus if etcd service already exists. + path: /var/lib/milvus/rdb_data + lrucacheratio: 0.06 # rocksdb cache memory ratio + rocksmqPageSize: 67108864 # The maximum size of messages in each page in RocksMQ. Messages in RocksMQ are checked and cleared (when expired) in batch based on this parameters. Unit: Byte. + retentionTimeInMinutes: 4320 # The maximum retention time of acked messages in RocksMQ. Acked messages in RocksMQ are retained for the specified period of time and then cleared. 
Unit: Minute. + retentionSizeInMB: 8192 # The maximum retention size of acked messages of each topic in RocksMQ. Acked messages in each topic are cleared if their size exceed this parameter. Unit: MB. + compactionInterval: 86400 # Time interval to trigger rocksdb compaction to remove deleted data. Unit: Second + compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level. + +# natsmq configuration. +# more detail: https://docs.nats.io/running-a-nats-service/configuration +natsmq: + server: + port: 4222 # Listening port of the NATS server. + storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats + maxFileStore: 17179869184 # Maximum size of the 'file' storage + maxPayload: 8388608 # Maximum number of bytes in a message payload + maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections + initializeTimeout: 4000 # waiting for initialization of natsmq finished + monitor: + trace: false # If true enable protocol trace log messages + debug: false # If true enable debug log messages + logTime: true # If set to false, log without timestamps. + logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path + logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one + retention: + maxAge: 4320 # Maximum age of any message in the P-channel + maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size + maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit + +# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests +rootCoord: + dmlChannelNum: 16 # The number of DML-Channels to create at the root coord startup. 
+ # The maximum number of partitions in each collection. + # New partitions cannot be created if this parameter is set as 0 or 1. + # Range: [0, INT64MAX] + maxPartitionNum: 1024 + # The minimum row count of a segment required for creating index. + # Segments with smaller size than this parameter will not be indexed, and will be searched with brute force. + minSegmentSizeToEnableIndex: 1024 + enableActiveStandby: false + maxDatabaseNum: 64 # Maximum number of database + maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber + gracefulStopTimeout: 5 # seconds. force stop node without graceful stop + ip: # TCP/IP address of rootCoord. If not specified, use the first unicastable address + port: 53100 # TCP port of rootCoord + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the rootCoord can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the rootCoord can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on rootCoord can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on rootCoord can receive, unit: byte + +# Related configuration of proxy, used to validate client requests and reduce the returned results. +proxy: + timeTickInterval: 200 # The interval at which proxy synchronizes the time tick, unit: ms. + healthCheckTimeout: 3000 # ms, the interval that to do component healthy check + msgStream: + timeTick: + bufSize: 512 # The maximum number of messages can be buffered in the timeTick message stream of the proxy when producing messages. + maxNameLength: 255 # The maximum length of the name or alias that can be created in Milvus, including the collection name, collection alias, partition name, and field name. + maxFieldNum: 64 # The maximum number of field can be created when creating in a collection. 
It is strongly DISCOURAGED to set maxFieldNum >= 64. + maxVectorFieldNum: 4 # The maximum number of vector fields that can be specified in a collection. Value range: [1, 10]. + maxShardNum: 16 # The maximum number of shards can be created when creating in a collection. + maxDimension: 32768 # The maximum number of dimensions of a vector can have when creating in a collection. + # Whether to produce gin logs.\n + # please adjust in embedded Milvus: false + ginLogging: true + ginLogSkipPaths: / # skip url path for gin log + maxTaskNum: 1024 # The maximum number of tasks in the task queue of the proxy. + mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection + accessLog: + enable: false # Whether to enable the access log feature. + minioEnable: false # Whether to upload local access log files to MinIO. This parameter can be specified when proxy.accessLog.filename is not empty. + localPath: /tmp/milvus_access # The local folder path where the access log file is stored. This parameter can be specified when proxy.accessLog.filename is not empty. + filename: # The name of the access log file. If you leave this parameter empty, access logs will be printed to stdout. + maxSize: 64 # The maximum size allowed for a single access log file. If the log file size reaches this limit, a rotation process will be triggered. This process seals the current access log file, creates a new log file, and clears the contents of the original log file. Unit: MB. + rotatedTime: 0 # The maximum time interval allowed for rotating a single access log file. Upon reaching the specified time interval, a rotation process is triggered, resulting in the creation of a new access log file and sealing of the previous one. Unit: seconds + remotePath: access_log/ # The path of the object storage for uploading access log files. + remoteMaxTime: 0 # The time interval allowed for uploading access log files. 
If the upload time of a log file exceeds this interval, the file will be deleted. Setting the value to 0 disables this feature. + formatters: + base: + format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]" + query: + format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]" + methods: "Query,Search,Delete" + cacheSize: 0 # Size of log of write cache, in byte. (Close write cache if size was 0) + cacheFlushInterval: 3 # time interval of auto flush write cache, in seconds. (Close auto flush if interval was 0) + connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info + connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds + maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos + gracefulStopTimeout: 30 # seconds. force stop node without graceful stop + slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds. + queryNodePooling: + size: 10 # the size for shardleader(querynode) client pool + http: + enabled: true # Whether to enable the http server + debug_mode: false # Whether to enable http server debug mode + port: # high-level restful api + acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64 + enablePprof: true # Whether to enable pprof middleware on the metrics port + ip: # TCP/IP address of proxy. 
If not specified, use the first unicastable address + port: 19530 # TCP port of proxy + internalPort: 19529 + grpc: + serverMaxSendSize: 268435456 # The maximum size of each RPC request that the proxy can send, unit: byte + serverMaxRecvSize: 67108864 # The maximum size of each RPC request that the proxy can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on proxy can send, unit: byte + clientMaxRecvSize: 67108864 # The maximum size of each RPC request that the clients on proxy can receive, unit: byte + +# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments. +queryCoord: + taskMergeCap: 1 + taskExecutionCap: 256 + # Switch value to control if to automatically replace a growing segment with the corresponding indexed sealed segment when the growing segment reaches the sealing threshold. + # If this parameter is set false, Milvus simply searches the growing segments with brute force. + autoHandoff: true + autoBalance: true # Switch value to control if to automatically balance the memory usage among query nodes by distributing segment loading and releasing operations evenly. + autoBalanceChannel: true # Enable auto balance channel + balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes + globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes + scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance + reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance + overloadedMemoryThresholdPercentage: 90 # The threshold of memory usage (in percentage) in a query node to trigger the sealed segment balancing. + balanceIntervalSeconds: 60 # The interval at which query coord balances the memory usage among query nodes. 
+ memoryUsageMaxDifferencePercentage: 30 # The threshold of memory usage difference (in percentage) between any two query nodes to trigger the sealed segment balancing. + rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes + segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes + globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes + segmentCountMaxSteps: 50 # segment count based plan generator max steps + rowCountMaxSteps: 50 # segment count based plan generator max steps + randomMaxSteps: 10 # segment count based plan generator max steps + growingRowCountWeight: 4 # the memory weight of growing segment row count + delegatorMemoryOverloadFactor: 0.1 # the factor of delegator overloaded memory + balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed + checkSegmentInterval: 1000 + checkChannelInterval: 1000 + checkBalanceInterval: 10000 + checkIndexInterval: 10000 + channelTaskTimeout: 60000 # 1 minute + segmentTaskTimeout: 120000 # 2 minutes + distPullInterval: 500 + heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available + loadTimeoutSeconds: 600 + distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds + heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds + checkHandoffInterval: 5000 + enableActiveStandby: false + checkInterval: 1000 + checkHealthInterval: 3000 # 3s, the interval at which query coord tries to check health of query node + checkHealthRPCTimeout: 2000 # 2000ms, the timeout of check health rpc to query node + brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout + collectionRecoverTimes: 3 # if collection
recover times reach the limit during loading state, release it + observerTaskParallel: 16 # the parallel observer dispatcher task number + checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config + checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session + gracefulStopTimeout: 5 # seconds. force stop node without graceful stop + enableStoppingBalance: true # whether enable stopping balance + channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode + collectionObserverInterval: 200 # the interval of collection observer + checkExecutedFlagInterval: 100 # the interval of check executed flag to force to pull dist + updateCollectionLoadStatusInterval: 5 # 5m, max interval for updating collection loaded status + cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds + ip: # TCP/IP address of queryCoord. If not specified, use the first unicastable address + port: 19531 # TCP port of queryCoord + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the queryCoord can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the queryCoord can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on queryCoord can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on queryCoord can receive, unit: byte + +# Related configuration of queryNode, used to run hybrid search between vector and scalar data. +queryNode: + stats: + publishInterval: 1000 # The interval that query node publishes the node statistics information, including segment status, cpu usage, memory usage, health status, etc. Unit: ms. + segcore: + knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. 
If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]). + chunkRows: 128 # Row count by which Segcore divides a segment into chunks. + interimIndex: + # Whether to create a temporary index for growing segments and sealed segments not yet indexed, improving search performance. + # Milvus will eventually seal and index all segments, but enabling this optimizes search performance for immediate queries following data insertion. + # This defaults to true, indicating that Milvus creates temporary index for growing segments and the sealed segments that are not indexed upon searches. + enableIndex: true + nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must be smaller than chunkRows/8 + nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must be smaller than nlist + memExpansionRate: 1.15 # extra memory needed by building interim index + buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num + knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic + loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments + enableDisk: false # enable querynode load disk index, and search on disk index + maxDiskUsagePercentage: 95 + cache: + enabled: true + memoryLimit: 2147483648 # 2 GB, 2 * 1024 * 1024 * 1024 + readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed` + # options: async, sync, disable. + # Specifies the necessity for warming up the chunk cache. + # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the + # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency + # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage; + # 2.
If set to "disable" original vector data will only be loaded into the chunk cache during search/query. + warmup: disable + mmap: + mmapEnabled: false # Enable mmap for loading data + growingMmapEnabled: false # Enable mmap for using in growing raw data + fixedFileSizeForMmapAlloc: 1 # tmp file size for mmap chunk manager + maxDiskUsagePercentageForMmapAlloc: 50 # disk percentage used in mmap chunk manager + lazyload: + enabled: false # Enable lazyload for loading data + waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve + requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default + requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default + maxRetryTimes: 1 # max retry times for lazy load, 1 by default + maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default + grouping: + enabled: true + maxNQ: 1000 + topKMergeRatio: 20 + scheduler: + receiveChanSize: 10240 + unsolvedQueueSize: 10240 + # maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task). + # Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio. + # It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2. + # Max read concurrency must greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100. + # (0, 100] + maxReadConcurrentRatio: 1 + cpuRatio: 10 # ratio used to estimate read task cpu usage. + maxTimestampLag: 86400 + scheduleReadPolicy: + # fifo: A FIFO queue support the schedule. + # user-task-polling: + # The user's tasks will be polled one by one and scheduled. + # Scheduling is fair on task granularity. + # The policy is based on the username for authentication. + # And an empty username is considered the same user. 
+ # When there are no multi-users, the policy decay into FIFO" + name: fifo + taskQueueExpire: 60 # Control how long (many seconds) that queue retains since queue is empty + enableCrossUserGrouping: false # Enable Cross user grouping when using user-task-polling policy. (Disable it if user's task can not merge each other) + maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler + dataSync: + flowGraph: + maxQueueLength: 16 # The maximum size of task queue cache in flow graph in query node. + maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph + enableSegmentPrune: false # use partition stats to prune data in search/query on shard delegator + bloomFilterApplyParallelFactor: 4 # parallel factor when to apply pk to bloom filter, default to 4*CPU_CORE_NUM + queryStreamBatchSize: 4194304 # return batch size of stream query + workerPooling: + size: 10 # the size for worker querynode client pool + ip: # TCP/IP address of queryNode. If not specified, use the first unicastable address + port: 21123 # TCP port of queryNode + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the queryNode can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the queryNode can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on queryNode can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on queryNode can receive, unit: byte + +indexCoord: + bindIndexNodeMode: + enable: false + address: localhost:22930 + withCred: false + nodeID: 0 + segment: + minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed + +indexNode: + scheduler: + buildParallel: 1 + enableDisk: true # enable index node build disk vector index + maxDiskUsagePercentage: 95 + ip: # TCP/IP address of indexNode. 
If not specified, use the first unicastable address + port: 21121 # TCP port of indexNode + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the indexNode can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the indexNode can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on indexNode can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on indexNode can receive, unit: byte + +dataCoord: + channel: + watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler update watch progress will reset timeout timer. + balanceWithRpc: true # Whether to enable balance with RPC, default to use etcd watch + legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered as legacy nodes, which don't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels + balanceSilentDuration: 300 # The duration after which the channel manager starts background channel balancing + balanceInterval: 360 # The interval with which the channel manager checks dml channel balance status + checkInterval: 1 # The interval in seconds with which the channel manager advances channel states + notifyChannelOperationTimeout: 5 # Timeout notifying channel operations (in seconds). + segment: + maxSize: 1024 # The maximum size of a segment, unit: MB. datacoord.segment.maxSize and datacoord.segment.sealProportion together determine if a segment can be sealed. + diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index + sealProportion: 0.12 # The minimum proportion to datacoord.segment.maxSize to seal a segment. datacoord.segment.maxSize and datacoord.segment.sealProportion together determine if a segment can be sealed.
+ assignmentExpiration: 2000 # Expiration time of the segment assignment, unit: ms + allocLatestExpireAttempt: 200 # The time attempting to alloc latest lastExpire from rootCoord after restart + maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60 + # If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than + # minSizeFromIdleToSealed, Milvus will automatically seal it. + # The max idle time of segment in seconds, 10*60. + maxIdleTime: 600 + minSizeFromIdleToSealed: 16 # The min size in MB of segment which can be idle from sealed. + # The max number of binlog files for one segment, the segment will be sealed if + # the number of binlog files reaches the max value. + maxBinlogFileNumber: 32 + smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than + # (smallProportion * segment max # of rows). + # A compaction will happen on small segments if the segment after compaction will have + compactableProportion: 0.85 + # over (compactableProportion * segment max # of rows) rows. + # MUST BE GREATER THAN OR EQUAL TO smallProportion!!! + # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%. + expansionRate: 1.25 + sealPolicy: + channel: + # The size threshold in MB, if the total size of growing segments of each shard + # exceeds this threshold, the largest growing segment will be sealed. + growingSegmentsMemSize: 4096 + autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version + segmentFlushInterval: 2 # the minimal interval duration(unit: Seconds) between flushing operation on same segment + # Switch value to control if to enable segment compaction. + # Compaction merges small-size segments into a large segment, and clears the entities deleted beyond the retention duration of Time Travel.
+ enableCompaction: true + compaction: + # Switch value to control if to enable automatic segment compaction during which data coord locates and merges compactable segments in the background. + # This configuration takes effect only when dataCoord.enableCompaction is set as true. + enableAutoCompaction: true + indexBasedCompaction: true + rpcTimeout: 10 + maxParallelTaskNum: 10 + workerMaxParallelTaskNum: 2 + clustering: + enable: true # Enable clustering compaction + autoEnable: false # Enable auto clustering compaction + triggerInterval: 600 # clustering compaction trigger interval in seconds + minInterval: 3600 # The minimum interval between clustering compaction executions of one collection, to avoid redundant compaction + maxInterval: 259200 # If a collection hasn't been clustering compacted for longer than maxInterval, force compact + newDataSizeThreshold: 512m # If new data size is larger than newDataSizeThreshold, execute clustering compaction + preferSegmentSizeRatio: 0.8 + maxSegmentSizeRatio: 1 + maxTrainSizeRatio: 0.8 # max data size ratio in Kmeans train, if larger than it, will downsample to meet this limit + maxCentroidsNum: 10240 # maximum centroids number in Kmeans train + minCentroidsNum: 16 # minimum centroids number in Kmeans train + minClusterSizeRatio: 0.01 # minimum cluster size / avg size in Kmeans train + maxClusterSizeRatio: 10 # maximum cluster size / avg size in Kmeans train + maxClusterSize: 5g # maximum cluster size in Kmeans train + levelzero: + forceTrigger: + minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB + maxSize: 67108864 # The maximum size in bytes to force trigger a LevelZero Compaction, default as 64MB + deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction + deltalogMaxNum: 30 # The maximum number of deltalog files to force trigger a LevelZero Compaction, default as 30 + syncSegmentsInterval: 300 # The time interval for
regularly syncing segments + enableGarbageCollection: true # Switch value to control if to enable garbage collection to clear the discarded data in MinIO or S3 service. + gc: + interval: 3600 # The interval at which data coord performs garbage collection, unit: second. + missingTolerance: 86400 # The retention duration of the unrecorded binary log (binlog) files. Setting a reasonably large value for this parameter avoids erroneously deleting the newly created binlog files that lack metadata. Unit: second. + dropTolerance: 10800 # The retention duration of the binlog files of the deleted segments before they are cleared, unit: second. + removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects + scanInterval: 168 # orphan file (file on oss but has not been registered on meta) on object storage garbage collection scanning interval in hours + enableActiveStandby: false + brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout + autoBalance: true # Enable auto balance + checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config + import: + filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task. + taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state. + maxSizeInMBPerImportTask: 6144 # To prevent generating of small segments, we will re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask). + scheduleInterval: 2 # The interval for scheduling import, measured in seconds. + checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker. + checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker. + maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request. 
+ waitForIndex: true # Indicates whether the import operation waits for the completion of index building. + gracefulStopTimeout: 5 # seconds. force stop node without graceful stop + slot: + clusteringCompactionUsage: 16 # slot usage of clustering compaction job. + mixCompactionUsage: 8 # slot usage of mix compaction job. + l0DeleteCompactionUsage: 8 # slot usage of l0 compaction job. + ip: # TCP/IP address of dataCoord. If not specified, use the first unicastable address + port: 13333 # TCP port of dataCoord + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the dataCoord can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the dataCoord can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on dataCoord can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on dataCoord can receive, unit: byte + +dataNode: + dataSync: + flowGraph: + maxQueueLength: 16 # Maximum length of task queue in flowgraph + maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph + maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally + skipMode: + enable: true # Support skip some timetick message to reduce CPU usage + skipNum: 4 # Consume one for every n records skipped + coldTime: 60 # Turn on skip mode after there are only timetick msg for x seconds + segment: + # The maximum size of each binlog file in a segment buffered in memory. Binlog files whose size exceeds this value are then flushed to MinIO or S3 service. + # Unit: Byte + # Setting this parameter too small causes the system to store a small amount of data too frequently. Setting it too large increases the system's demand for memory. 
+ insertBufSize: 16777216 + deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB + syncPeriod: 600 # The period to sync segments if buffer is not empty. + memory: + forceSyncEnable: true # Set true to force sync if memory usage is too high + forceSyncSegmentNum: 1 # number of segments to sync, segments with top largest buffer will be synced. + checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds + forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced. + timetick: + byRPC: true + interval: 500 + channel: + # specify the size of global work pool of all channels + # if this parameter <= 0, will set it as the maximum number of CPUs that can be executing + # suggest to set it bigger on large collection numbers to avoid blocking + workPoolSize: -1 + # specify the size of global work pool for channel checkpoint updating + # if this parameter <= 0, will set it as 10 + updateChannelCheckpointMaxParallel: 10 + updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel + updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call + maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC. + channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates. + import: + maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode. + maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files. + readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import. 
+ compaction: + levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode + levelZeroMaxBatchSize: -1 # Max batch size refers to the max number of L1/L2 segments in a batch when executing L0 compaction. Default to -1, any value that is less than 1 means no limit. Valid range: >= 1. + gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop + slot: + slotCap: 16 # The maximum number of tasks(e.g. compaction, importing) allowed to run concurrently on a datanode + clusteringCompaction: + memoryBufferRatio: 0.1 # The ratio of memory buffer of clustering compaction. Data larger than threshold will be flushed to storage. + workPoolSize: 8 # worker pool size for one clustering compaction job. + ip: # TCP/IP address of dataNode. If not specified, use the first unicastable address + port: 21124 # TCP port of dataNode + grpc: + serverMaxSendSize: 536870912 # The maximum size of each RPC request that the dataNode can send, unit: byte + serverMaxRecvSize: 268435456 # The maximum size of each RPC request that the dataNode can receive, unit: byte + clientMaxSendSize: 268435456 # The maximum size of each RPC request that the clients on dataNode can send, unit: byte + clientMaxRecvSize: 536870912 # The maximum size of each RPC request that the clients on dataNode can receive, unit: byte + +# This topic introduces the message channel-related configurations of Milvus. +msgChannel: + chanNamePrefix: + # Root name prefix of the channel when a message channel is created. + # It is recommended to change this parameter before starting Milvus for the first time. + # To share a Pulsar instance among multiple Milvus instances, consider changing this to a name rather than the default one for each Milvus instance before you start them. + cluster: by-dev + # Sub-name prefix of the message channel where the root coord publishes time tick messages. 
+ # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.rootCoordTimeTick} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + rootCoordTimeTick: rootcoord-timetick + # Sub-name prefix of the message channel where the root coord publishes its own statistics messages. + # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.rootCoordStatistics} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + rootCoordStatistics: rootcoord-statistics + # Sub-name prefix of the message channel where the root coord publishes Data Manipulation Language (DML) messages. + # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.rootCoordDml} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + rootCoordDml: rootcoord-dml + replicateMsg: replicate-msg + # Sub-name prefix of the message channel where the query node publishes time tick messages. + # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.queryTimeTick} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + queryTimeTick: queryTimeTick + # Sub-name prefix of the message channel where the data coord publishes time tick messages. 
+ # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.dataCoordTimeTick} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + dataCoordTimeTick: datacoord-timetick-channel + # Sub-name prefix of the message channel where the data coord publishes segment information messages. + # The complete channel name prefix is ${msgChannel.chanNamePrefix.cluster}-${msgChannel.chanNamePrefix.dataCoordSegmentInfo} + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + dataCoordSegmentInfo: segment-info-channel + subNamePrefix: + # Subscription name prefix of the data coord. + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + dataCoordSubNamePrefix: dataCoord + # Subscription name prefix of the data node. + # Caution: Changing this parameter after using Milvus for a period of time will affect your access to old data. + # It is recommended to change this parameter before starting Milvus for the first time. + dataNodeSubNamePrefix: dataNode + +# Configures the system log output. +log: + # Milvus log level. Option: debug, info, warn, error, panic, and fatal. + # It is recommended to use debug level under test and development environments, and info level in production environment. + level: info + file: + # Root path to the log files. + # The default value is set empty, indicating to output log files to standard output (stdout) and standard error (stderr). + # If this parameter is set to a valid local path, Milvus writes and stores log files in this path. 
+ # Set this parameter as the path that you have permission to write. + rootPath: + maxSize: 300 # The maximum size of a log file, unit: MB. + maxAge: 10 # The maximum retention time before a log file is automatically cleared, unit: day. The minimum value is 1. + maxBackups: 20 # The maximum number of log files to back up. The minimum value is 1. + format: text # Milvus log format. Option: text and JSON + stdout: true # Stdout enable or not + +grpc: + log: + level: WARNING + gracefulStopTimeout: 10 # second, time to wait graceful stop finish + client: + compressionEnabled: false + dialTimeout: 200 + keepAliveTime: 10000 + keepAliveTimeout: 20000 + maxMaxAttempts: 10 + initialBackoff: 0.2 + maxBackoff: 10 + minResetInterval: 1000 + maxCancelError: 32 + minSessionCheckInterval: 200 + +# Configures the TLS certificate and key paths used by the proxy. +tls: + serverPemPath: configs/cert/server.pem + serverKeyPath: configs/cert/server.key + caPemPath: configs/cert/ca.pem + +common: + defaultPartitionName: _default # Name of the default partition when a collection is created + defaultIndexName: _default_idx # Name of the index when it is created with name unspecified + entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire + indexSliceSize: 16 # Index slice size in MB + threadCoreCoefficient: + highPriority: 10 # This parameter specifies how many times the number of threads is the number of cores in high priority pool + middlePriority: 5 # This parameter specifies how many times the number of threads is the number of cores in middle priority pool + lowPriority: 1 # This parameter specifies how many times the number of threads is the number of cores in low priority pool + buildIndexThreadPoolRatio: 0.75 + DiskIndex: + MaxDegree: 56 + SearchListSize: 100 + PQCodeBudgetGBRatio: 0.125 + BuildNumThreadsRatio: 1 + SearchCacheBudgetGBRatio: 0.1 + LoadNumThreadRatio: 8 + BeamWidthRatio: 4 + gracefulTime: 5000 # milliseconds.
it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency. + gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time. + storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead + # Default value: auto + # Valid values: [auto, avx512, avx2, avx, sse4_2] + # This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building. + simdType: auto + security: + authorizationEnabled: false + # The superusers will ignore some system check processes, + # like the old password verification when updating the credential + superUsers: + defaultRootPassword: Milvus # default password for root user + tlsMode: 0 + session: + ttl: 30 # ttl value when session granting a lease to register service + retryTimes: 30 # retry times when session sending etcd requests + locks: + metrics: + enable: false # whether gather statistics for metrics locks + threshold: + info: 500 # minimum milliseconds for printing durations in info level + warn: 1000 # minimum milliseconds for printing durations in warn level + storage: + scheme: s3 + enablev2: false + # Whether to disable the internal time messaging mechanism for the system. + # If disabled (set to false), the system will not allow DML operations, including insertion, deletion, queries, and searches. 
+ # This helps Milvus-CDC synchronize incremental data + ttMsgEnabled: true + traceLogMode: 0 # trace request info + bloomFilterSize: 100000 # bloom filter initial size + maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter + bloomFilterType: BasicBloomFilter # bloom filter type, support BasicBloomFilter and BlockedBloomFilter + bloomFilterApplyBatchSize: 1000 # batch size when to apply pk to bloom filter + usePartitionKeyAsClusteringKey: false # if true, do clustering compaction and segment prune on partition key field + useVectorAsClusteringKey: false # if true, do clustering compaction and segment prune on vector field + enableVectorClusteringKey: false # if true, enable vector clustering key and vector clustering compaction + +# QuotaConfig, configurations of Milvus quota and limits. +# By default, we enable: +# 1. TT protection; +# 2. Memory protection. +# 3. Disk quota protection. +# You can enable: +# 1. DML throughput limitation; +# 2. DDL, DQL qps/rps limitation; +# 3. DQL Queue length/latency protection; +# 4. DQL result rate protection; +# If necessary, you can also manually force to deny RW requests. +quotaAndLimits: + enabled: true # `true` to enable quota and limits, `false` to disable. + # quotaCenterCollectInterval is the time interval that quotaCenter + # collects metrics from Proxies, Query cluster and Data cluster. + # seconds, (0 ~ 65536) + quotaCenterCollectInterval: 3 + limits: + allocRetryTimes: 15 # retry times when delete alloc forward data from rate limit failed + allocWaitInterval: 1000 # retry wait duration when delete alloc forward data rate failed, in millisecond + complexDeleteLimitEnable: false # whether complex delete check forward data by limiter + maxCollectionNum: 65536 + maxCollectionNumPerDB: 65536 # Maximum number of collections per database. 
+ maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit + maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes + ddl: + enabled: false # Whether DDL request throttling is enabled. + # Maximum number of collection-related DDL requests per second. + # Setting this item to 10 indicates that Milvus processes no more than 10 collection-related DDL requests per second, including collection creation requests, collection drop requests, collection load requests, and collection release requests. + # To use this setting, set quotaAndLimits.ddl.enabled to true at the same time. + collectionRate: -1 + # Maximum number of partition-related DDL requests per second. + # Setting this item to 10 indicates that Milvus processes no more than 10 partition-related requests per second, including partition creation requests, partition drop requests, partition load requests, and partition release requests. + # To use this setting, set quotaAndLimits.ddl.enabled to true at the same time. + partitionRate: -1 + db: + collectionRate: -1 # qps of db level, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection + partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition + indexRate: + enabled: false # Whether index-related request throttling is enabled. + # Maximum number of index-related requests per second. + # Setting this item to 10 indicates that Milvus processes no more than 10 index-related requests per second, including index creation requests and index drop requests. + # To use this setting, set quotaAndLimits.indexRate.enabled to true at the same time. + max: -1 + db: + max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex + flushRate: + enabled: true # Whether flush request throttling is enabled. + # Maximum number of flush requests per second.
+ # Setting this item to 10 indicates that Milvus processes no more than 10 flush requests per second. + # To use this setting, set quotaAndLimits.flushRate.enabled to true at the same time. + max: -1 + collection: + max: 0.1 # qps, default no limit, rate for flush at collection level. + db: + max: -1 # qps of db level, default no limit, rate for flush + compactionRate: + enabled: false # Whether manual compaction request throttling is enabled. + # Maximum number of manual-compaction requests per second. + # Setting this item to 10 indicates that Milvus processes no more than 10 manual-compaction requests per second. + # To use this setting, set quotaAndLimits.compactionRate.enabled to true at the same time. + max: -1 + db: + max: -1 # qps of db level, default no limit, rate for manualCompaction + dml: + enabled: false # Whether DML request throttling is enabled. + insertRate: + # Highest data insertion rate per second. + # Setting this item to 5 indicates that Milvus only allows data insertion at the rate of 5 MB/s. + # To use this setting, set quotaAndLimits.dml.enabled to true at the same time. + max: -1 + db: + max: -1 # MB/s, default no limit + collection: + # Highest data insertion rate per collection per second. + # Setting this item to 5 indicates that Milvus only allows data insertion to any collection at the rate of 5 MB/s. + # To use this setting, set quotaAndLimits.dml.enabled to true at the same time. + max: -1 + partition: + max: -1 # MB/s, default no limit + upsertRate: + max: -1 # MB/s, default no limit + db: + max: -1 # MB/s, default no limit + collection: + max: -1 # MB/s, default no limit + partition: + max: -1 # MB/s, default no limit + deleteRate: + # Highest data deletion rate per second. + # Setting this item to 0.1 indicates that Milvus only allows data deletion at the rate of 0.1 MB/s. + # To use this setting, set quotaAndLimits.dml.enabled to true at the same time.
+ max: -1 + db: + max: -1 # MB/s, default no limit + collection: + # Highest data deletion rate per second. + # Setting this item to 0.1 indicates that Milvus only allows data deletion from any collection at the rate of 0.1 MB/s. + # To use this setting, set quotaAndLimits.dml.enabled to true at the same time. + max: -1 + partition: + max: -1 # MB/s, default no limit + bulkLoadRate: + max: -1 # MB/s, default no limit, not support yet. TODO: limit bulkLoad rate + db: + max: -1 # MB/s, default no limit, not support yet. TODO: limit db bulkLoad rate + collection: + max: -1 # MB/s, default no limit, not support yet. TODO: limit collection bulkLoad rate + partition: + max: -1 # MB/s, default no limit, not support yet. TODO: limit partition bulkLoad rate + dql: + enabled: false # Whether DQL request throttling is enabled. + searchRate: + # Maximum number of vectors to search per second. + # Setting this item to 100 indicates that Milvus only allows searching 100 vectors per second no matter whether these 100 vectors are all in one search or scattered across multiple searches. + # To use this setting, set quotaAndLimits.dql.enabled to true at the same time. + max: -1 + db: + max: -1 # vps (vectors per second), default no limit + collection: + # Maximum number of vectors to search per collection per second. + # Setting this item to 100 indicates that Milvus only allows searching 100 vectors per second per collection no matter whether these 100 vectors are all in one search or scattered across multiple searches. + # To use this setting, set quotaAndLimits.dql.enabled to true at the same time. + max: -1 + partition: + max: -1 # vps (vectors per second), default no limit + queryRate: + # Maximum number of queries per second. + # Setting this item to 100 indicates that Milvus only allows 100 queries per second. + # To use this setting, set quotaAndLimits.dql.enabled to true at the same time. 
+ max: -1 + db: + max: -1 # qps, default no limit + collection: + # Maximum number of queries per collection per second. + # Setting this item to 100 indicates that Milvus only allows 100 queries per collection per second. + # To use this setting, set quotaAndLimits.dql.enabled to true at the same time. + max: -1 + partition: + max: -1 # qps, default no limit + limitWriting: + # forceDeny false means dml requests are allowed (except for some + # specific conditions, such as memory of nodes to water marker), true means always reject all dml requests. + forceDeny: false + ttProtection: + enabled: false + # maxTimeTickDelay indicates the backpressure for DML Operations. + # DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay, + # if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected. + # seconds + maxTimeTickDelay: 300 + memProtection: + # When memory usage > memoryHighWaterLevel, all dml requests would be rejected; + # When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate; + # When memory usage < memoryLowWaterLevel, no action. + enabled: true + dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes + dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes + queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes + queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes + growingSegmentsSizeProtection: + # No action will be taken if the growing segments size is less than the low watermark. + # When the growing segments size exceeds the low watermark, the dml rate will be reduced, + # but the rate will not be lower than minRateRatio * dmlRate. 
+ enabled: false + minRateRatio: 0.5 + lowWaterLevel: 0.2 + highWaterLevel: 0.4 + diskProtection: + enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected; + diskQuota: -1 # MB, (0, +inf), default no limit + diskQuotaPerDB: -1 # MB, (0, +inf), default no limit + diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit + diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit + l0SegmentsRowCountProtection: + enabled: false # switch to enable l0 segment row count quota + lowWaterLevel: 32768 # l0 segment row count quota, low water level + highWaterLevel: 65536 # l0 segment row count quota, high water level + limitReading: + # forceDeny false means dql requests are allowed (except for some + # specific conditions, such as collection has been dropped), true means always reject all dql requests. + forceDeny: false + queueProtection: + enabled: false + # nqInQueueThreshold indicated that the system was under backpressure for Search/Query path. + # If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off + # until the NQ in queue no longer exceeds nqInQueueThreshold. We think of the NQ of query request as 1. + # int, default no limit + nqInQueueThreshold: -1 + # queueLatencyThreshold indicated that the system was under backpressure for Search/Query path. + # If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off + # until the latency of queuing no longer exceeds queueLatencyThreshold. + # The latency here refers to the averaged latency over a period of time. + # milliseconds, default no limit + queueLatencyThreshold: -1 + resultProtection: + enabled: false + # maxReadResultRate indicated that the system was under backpressure for Search/Query path.
+ # If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off + # until the read result rate no longer exceeds maxReadResultRate. + # MB/s, default no limit + maxReadResultRate: -1 + maxReadResultRatePerDB: -1 + maxReadResultRatePerCollection: -1 + # coolOffSpeed is the speed at which search&query rates cool off. + # (0, 1] + coolOffSpeed: 0.9 + +trace: + # trace exporter type, default is stdout, + # optional values: ['noop','stdout', 'jaeger', 'otlp'] + exporter: noop + # fraction of traceID based sampler, + # optional values: [0, 1] + # Fractions >= 1 will always sample. Fractions < 0 are treated as zero. + sampleFraction: 0 + jaeger: + url: # when exporter is jaeger should set the jaeger's URL + otlp: + endpoint: # example: "127.0.0.1:4317" for grpc, "127.0.0.1:4318" for http + method: # otlp export method, acceptable values: ["grpc", "http"], using "grpc" by default + secure: true + initTimeoutSeconds: 10 # segcore initialization timeout in seconds, preventing otlp grpc hangs forever + +#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation. +#here, you can set the size of the memory occupied by the memory pool, with the unit being MB. +#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize. +#if initMemSize and maxMemSize are both set to zero, +#milvus will automatically initialize half of the available GPU memory, +#and maxMemSize will be the whole available GPU memory.
+gpu: + initMemSize: # Gpu Memory Pool init size + maxMemSize: # Gpu Memory Pool Max size diff --git a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py new file mode 100644 index 0000000000..d6aba4c978 --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py @@ -0,0 +1,491 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import io +import json +import msgpack +import os +import tempfile +from pathlib import Path +from typing import List, Optional, Union + +from minio import Minio + +from comps.dataprep.minio.milvus.langchain.config import MINIO_WAREHOUSE_BUCKET +from comps.dataprep.minio.minio_schema import MinioEventNotification +from config import ( + COLLECTION_NAME, + MINIO_ENDPOINT, + MINIO_ACCESS_KEY, + MINIO_SECRET_KEY, + MINIO_SECURE, + MILVUS_HOST, + MILVUS_PORT, + MINIO_DOCUMENT_BUCKET +) +from fastapi import Body, File, Form, HTTPException, UploadFile, Request +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_core.documents import Document +from langchain_milvus.vectorstores import Milvus +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + create_upload_folder, + decode_filename, + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + remove_folder_with_ignore, +) + +logger = CustomLogger("prepare_doc_minio_milvus") +logflag = os.getenv("LOGFLAG", True) + +# workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py +# from utils import document_loader, get_tables_result, parse_html +index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}} +partition_field_name = "filename" +upload_folder = "./uploaded_files/" + +minio_client = Minio( + 
endpoint=MINIO_ENDPOINT, + access_key=MINIO_ACCESS_KEY, + secret_key=MINIO_SECRET_KEY, + secure=MINIO_SECURE) + + +class MosecEmbeddings(OpenAIEmbeddings): + def _get_len_safe_embeddings( + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + ) -> List[List[float]]: + _chunk_size = chunk_size or self.chunk_size + batched_embeddings: List[List[float]] = [] + response = self.client.create(input=texts, **self._invocation_params) + if not isinstance(response, dict): + response = response.model_dump() + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + _cached_empty_embedding: Optional[List[float]] = None + + def empty_embedding() -> List[float]: + nonlocal _cached_empty_embedding + if _cached_empty_embedding is None: + average_embedded = self.client.create(input="", **self._invocation_params) + if not isinstance(average_embedded, dict): + average_embedded = average_embedded.model_dump() + _cached_empty_embedding = average_embedded["data"][0]["embedding"] + return _cached_empty_embedding + + return [e if e is not None else empty_embedding() for e in batched_embeddings] + + +def ingest_chunks_to_milvus(file_name: str, chunks: List): + if logflag: + logger.info(f"[ ingest chunks ] file name: {file_name}") + + # insert documents to Milvus + insert_docs = [] + for chunk in chunks: + insert_docs.append(Document(page_content=chunk, metadata={partition_field_name: file_name})) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + + for i in range(0, num_chunks, batch_size): + if logflag: + logger.info(f"[ ingest chunks ] Current batch: {i}") + batch_docs = insert_docs[i: i + batch_size] + + try: + _ = Milvus.from_documents( + batch_docs, + embeddings, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + partition_key_field=partition_field_name, + ) + except Exception as e: + if logflag: + logger.info(f"[ ingest chunks ] fail to ingest chunks into Milvus. 
error: {e}") + raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.") + + if logflag: + logger.info(f"[ ingest chunks ] Docs ingested file {file_name} to Milvus collection {COLLECTION_NAME}.") + + return True + + +def ingest_data_to_minio(doc_path: DocPath): + """Ingest document to Milvus.""" + path = doc_path.path + file_name = path.split("/")[-1] + if logflag: + logger.info(f"[ ingest data ] Parsing document {path}, file name: {file_name}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + if logflag: + logger.info("[ ingest data ] file content loaded") + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info(f"[ ingest data ] Done preprocessing. 
Created {len(chunks)} chunks of the original file.") + + return chunks + + +def search_by_file(collection, file_name): + query = f"{partition_field_name} == '{file_name}'" + results = collection.query( + expr=query, + output_fields=[partition_field_name, "pk"], + ) + if logflag: + logger.info(f"[ search by file ] searched by {file_name}") + logger.info(f"[ search by file ] {len(results)} results: {results}") + return results + + +def search_all(collection): + results = collection.query(expr="pk >= 0", output_fields=[partition_field_name, "pk"]) + if logflag: + logger.info(f"[ search all ] {len(results)} results: {results}") + return results + + +def delete_all_data(my_milvus): + if logflag: + logger.info("[ delete all ] deleting all data in milvus") + if my_milvus.col: + my_milvus.col.drop() + if logflag: + logger.info("[ delete all ] delete success: all data") + + +def delete_by_partition_field(my_milvus, partition_field): + if logflag: + logger.info(f"[ delete partition ] deleting {partition_field_name} {partition_field}") + pks = my_milvus.get_pks(f'{partition_field_name} == "{partition_field}"') + if logflag: + logger.info(f"[ delete partition ] target pks: {pks}") + res = my_milvus.delete(pks) + my_milvus.col.flush() + if logflag: + logger.info(f"[ delete partition ] delete success: {res}") + + +@register_microservice(name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + logger.info(f"[ upload ] link_list:{link_list}") + + if files and link_list: + raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + + if files: + if not 
isinstance(files, list): + files = [files] + uploaded_files = [] + + for file in files: + encode_file = encode_filename(file.filename) + save_path = f"s3://{MINIO_DOCUMENT_BUCKET}/{encode_file}" + if logflag: + logger.info(f"[ upload ] processing file {save_path}") + + content = await file.read() + file_size = len(content) + file_data = io.BytesIO(content) + + minio_client.put_object( + bucket_name=MINIO_DOCUMENT_BUCKET, + object_name=encode_file, + data=file_data, + length=file_size, + content_type=file.content_type, + metadata={ + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "process_table": process_table, + "table_strategy": table_strategy + }) + + uploaded_files.append(save_path) + if logflag: + logger.info(f"Saved file {save_path} into MinIO") + + results = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(results) + return results + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + + for link in link_list: + encoded_link = encode_filename(link) + if logflag: + logger.info(f"[ upload ] processing link {encoded_link}") + + encode_file = f"{encoded_link}.txt" + content = parse_html([link])[0][0] + file_size = len(content) + file_data = io.BytesIO(content) + + minio_client.put_object( + bucket_name=MINIO_DOCUMENT_BUCKET, + object_name=encode_file, + data=file_data, + length=file_size, + metadata={ + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "process_table": process_table, + "table_strategy": table_strategy + }) + + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +@register_microservice( + name="opea_service@prepare_doc_minio_milvus", 
endpoint="/v1/minio/document/notification", host="0.0.0.0", port=6010 +) +async def process_documents(event: MinioEventNotification): + # json_data = await request.json() + # print(json.dumps(json_data, indent=2)) + print(event) + if event.EventName == "s3:ObjectCreated:Put": + for record in event.Records: + bucket_name = record.s3.bucket.name + object_name = record.s3.object.key + _, file_extension = os.path.splitext(object_name) + with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_file: + temp_file_path = temp_file.name + minio_client.fget_object(bucket_name, object_name, temp_file_path) + chunks = ingest_data_to_minio(DocPath( + path=temp_file_path, + chunk_size=record.s3.object.userMetadata.chunk_size, + chunk_overlap=record.s3.object.userMetadata.chunk_overlap, + process_table=record.s3.object.userMetadata.process_table, + table_strategy=record.s3.object.userMetadata.table_strategy, + )) + msgpack_data = msgpack.packb(chunks) + buffer = io.BytesIO(msgpack_data) + buffer_size = buffer.getbuffer().nbytes + minio_client.put_object( + MINIO_WAREHOUSE_BUCKET, + object_name=f"metadata/{object_name}.msgpack", + data=buffer, + length=buffer_size, + content_type='application/x-msgpack' + ) + return {"status": 200, "message": "Document processed successfully"} + + +@register_microservice( + name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/minio/metadata/notification", host="0.0.0.0", port=6010 +) +async def process_metadata(event: MinioEventNotification): + print(event) + pass + + +@register_microservice( + name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6010 +) +async def rag_get_file_structure(): + if logflag: + logger.info("[ get ] start to get file structure") + + # define Milvus obj + my_milvus = Milvus( + embedding_function=embeddings, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + index_params=index_params, + auto_id=True, 
+ ) + + # collection does not exist + if not my_milvus.col: + logger.info(f"[ get ] collection {COLLECTION_NAME} does not exist.") + return [] + + # get all files from db + try: + all_data = search_all(my_milvus.col) + except Exception as e: + raise HTTPException(status_code=500, detail="Failed when searching in Milvus db for all files.") + + # return [] if no data in db + if len(all_data) == 0: + return [] + + res_file = [res["filename"] for res in all_data] + unique_list = list(set(res_file)) + if logflag: + logger.info(f"[ get ] unique list from db: {unique_list}") + + # construct result file list in format + file_list = [] + for file_name in unique_list: + file_dict = { + "name": decode_filename(file_name), + "id": decode_filename(file_name), + "type": "File", + "parent": "", + } + file_list.append(file_dict) + + if logflag: + logger.info(f"[ get ] final file list: {file_list}") + return file_list + + +@register_microservice( + name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6010 +) +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - file/link path (e.g. /path/to/file.txt) + - "all": delete all files uploaded + """ + if logflag: + logger.info(file_path) + + # define Milvus obj + my_milvus = Milvus( + embedding_function=embeddings, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + index_params=index_params, + auto_id=True, + ) + + # delete all uploaded files + if file_path == "all": + if logflag: + logger.info("[ delete ] deleting all files") + + delete_all_data(my_milvus) + + # delete files on local disk + try: + remove_folder_with_ignore(upload_folder) + except Exception as e: + if logflag: + logger.info(f"[ delete ] {e}. 
Fail to delete {upload_folder}.") + raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.") + + if logflag: + logger.info("[ delete ] successfully delete all files.") + + create_upload_folder(upload_folder) + if logflag: + logger.info("[ delete ] new upload folder created.") + return {"status": True} + + encode_file_name = encode_filename(file_path) + delete_path = Path(upload_folder + "/" + encode_file_name) + if logflag: + logger.info(f"[delete] delete_path: {delete_path}") + + # partially delete files + if delete_path.exists(): + + # TODO: check existence before delete + + # delete file + if delete_path.is_file(): + if logflag: + logger.info(f"[delete] deleting file {encode_file_name}") + try: + delete_by_partition_field(my_milvus, encode_file_name) + except Exception as e: + if logflag: + logger.info(f"[delete] fail to delete file {delete_path}: {e}") + return {"status": False} + delete_path.unlink() + if logflag: + logger.info(f"[delete] file {file_path} deleted") + return {"status": True} + + # delete folder + else: + if logflag: + logger.info(f"[delete] delete folder {file_path} is not supported for now.") + raise HTTPException(status_code=404, detail=f"Delete folder {file_path} is not supported for now.") + else: + raise HTTPException(status_code=404, detail="File/folder not found. 
Please check del_path.") + + +if __name__ == "__main__": + create_upload_folder(upload_folder) + print(f"upload folder {upload_folder} created at {Path(upload_folder).absolute()}") + + # # Create vectorstore + # if MOSEC_EMBEDDING_ENDPOINT: + # # create embeddings using MOSEC endpoint service + # if logflag: + # logger.info( + # f"[ prepare_doc_minio_milvus ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + # ) + # embeddings = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) + # elif TEI_EMBEDDING_ENDPOINT: + # # create embeddings using TEI endpoint service + # if logflag: + # logger.info(f"[ prepare_doc_minio_milvus ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + # embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) + # else: + # # create embeddings using local embedding model + # if logflag: + # logger.info(f"[ prepare_doc_minio_milvus ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") + # embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL) + + opea_microservices["opea_service@prepare_doc_minio_milvus"].start() + print("DOCPREP Server Started") diff --git a/comps/dataprep/minio/milvus/langchain/requirements.txt b/comps/dataprep/minio/milvus/langchain/requirements.txt new file mode 100644 index 0000000000..5bae5d2525 --- /dev/null +++ b/comps/dataprep/minio/milvus/langchain/requirements.txt @@ -0,0 +1,31 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-text-splitters +langchain_milvus +markdown +msgpack +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +pytesseract +python-docx +python-pptx +sentence_transformers +shortuuid +tiktoken +unstructured[all-docs]==0.15.7 +uvicorn diff --git a/comps/dataprep/minio/minio_schema.py b/comps/dataprep/minio/minio_schema.py new file mode 100644 index 
0000000000..97f0c632cc --- /dev/null +++ b/comps/dataprep/minio/minio_schema.py @@ -0,0 +1,76 @@ +from pydantic import BaseModel, Field +from typing import List, Optional +from datetime import datetime + +class UserIdentity(BaseModel): + principalId: str + +class RequestParameters(BaseModel): + principalId: str + region: str + sourceIPAddress: str + +class ResponseElements(BaseModel): + x_amz_id_2: str = Field(..., alias="x-amz-id-2") + x_amz_request_id: str = Field(..., alias="x-amz-request-id") + x_minio_deployment_id: str = Field(..., alias="x-minio-deployment-id") + x_minio_origin_endpoint: str = Field(..., alias="x-minio-origin-endpoint") + +class BucketOwnerIdentity(BaseModel): + principalId: str + +class Bucket(BaseModel): + name: str + ownerIdentity: BucketOwnerIdentity + arn: str + +class ObjectUserMetadata(BaseModel): + content_type: str = Field(..., alias="content-type") + chunk_overlap: Optional[int] = Field(100, alias="X-Amz-Meta-Chunk_overlap") + chunk_size: Optional[int] = Field(1500, alias="X-Amz-Meta-Chunk_size") + process_table: Optional[bool] = Field(False, alias="X-Amz-Meta-Process_table") + table_strategy: Optional[str] = Field("fast", alias="X-Amz-Meta-Table_strategy") + + class Config: + populate_by_name = True + allow_population_by_field_name = True + +class S3Object(BaseModel): + key: str + size: int + eTag: str + contentType: str + userMetadata: ObjectUserMetadata + sequencer: str + +class S3(BaseModel): + s3SchemaVersion: str + configurationId: str + bucket: Bucket + object: S3Object + +class Source(BaseModel): + host: str + port: str + userAgent: str + +class Record(BaseModel): + eventVersion: str + eventSource: str + awsRegion: str + eventTime: datetime + eventName: str + userIdentity: UserIdentity + requestParameters: RequestParameters + responseElements: ResponseElements + s3: S3 + source: Source + +class MinioEventNotification(BaseModel): + EventName: str + Key: str + Records: List[Record] + + class Config: + from_attributes = True + 
populate_by_name = True \ No newline at end of file From a299782dd13e242ef3c9ae20cb1a823541b3348f Mon Sep 17 00:00:00 2001 From: dil Date: Fri, 1 Nov 2024 00:27:10 +0000 Subject: [PATCH 02/28] Initial version of MinIO event based dataprep for Milvus --- .../minio/milvus/langchain/Dockerfile | 4 +- .../dataprep/minio/milvus/langchain/config.py | 2 +- .../minio/milvus/langchain/docker-compose.yml | 61 ++++++ .../milvus/langchain/prepare_doc_milvus.py | 185 ++++++++++-------- .../minio/milvus/langchain/requirements.txt | 1 + comps/dataprep/minio/minio_schema.py | 17 +- 6 files changed, 182 insertions(+), 88 deletions(-) diff --git a/comps/dataprep/minio/milvus/langchain/Dockerfile b/comps/dataprep/minio/milvus/langchain/Dockerfile index 8c68b10edf..5766bb24de 100644 --- a/comps/dataprep/minio/milvus/langchain/Dockerfile +++ b/comps/dataprep/minio/milvus/langchain/Dockerfile @@ -22,9 +22,9 @@ USER user COPY comps /home/user/comps -RUN pip install --no-cache-dir --upgrade pip setuptools && \ +RUN pip install --upgrade pip setuptools && \ if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ - pip install --no-cache-dir -r /home/user/comps/dataprep/minio/milvus/langchain/requirements.txt + pip install -r /home/user/comps/dataprep/minio/milvus/langchain/requirements.txt ENV PYTHONPATH=$PYTHONPATH:/home/user diff --git a/comps/dataprep/minio/milvus/langchain/config.py b/comps/dataprep/minio/milvus/langchain/config.py index ef2a2366ea..205b2ecd5a 100644 --- a/comps/dataprep/minio/milvus/langchain/config.py +++ b/comps/dataprep/minio/milvus/langchain/config.py @@ -14,7 +14,7 @@ # MOSEC configuration MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5") MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "") -MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000") +MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") 
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") MINIO_SECRET_KEY=os.environ.get("MINIO_SECRET_KEY", "minioadmin") MINIO_SECURE=os.environ.get("MINIO_SECURE", "False").lower() == 'true' diff --git a/comps/dataprep/minio/milvus/langchain/docker-compose.yml b/comps/dataprep/minio/milvus/langchain/docker-compose.yml index 8dde3fed06..6c50fe491d 100644 --- a/comps/dataprep/minio/milvus/langchain/docker-compose.yml +++ b/comps/dataprep/minio/milvus/langchain/docker-compose.yml @@ -39,6 +39,49 @@ services: timeout: 20s retries: 3 + minio-setup: + container_name: minio-mc + image: minio/mc + depends_on: + minio: + condition: service_healthy + dataprep-milvus-service: + condition: service_started + + environment: + MINIO_URL: ${MINIO_URL:-http://minio:9000} + MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + DOCUMENT_WEBHOOK_URL: ${DOCUMENT_WEBHOOK_URL:-http://dataprep-milvus-service:6010/v1/minio/document/notification} + METADATA_WEBHOOK_URL: ${METADATA_WEBHOOK_URL:-http://dataprep-milvus-service:6010/v1/minio/metadata/notification} + entrypoint: + - /bin/sh + - -c + - | + set -x; + mc alias set myminio $${MINIO_URL} $${MINIO_ROOT_USER} $${MINIO_ROOT_PASSWORD}; + + # Create buckets + mc mb --ignore-existing myminio/document; + mc mb --ignore-existing myminio/warehouse; + echo 'Created Buckets'; + # Configure webhooks + mc admin config set myminio notify_webhook:document_notify endpoint=\"$${DOCUMENT_WEBHOOK_URL}\"; + mc admin config set myminio notify_webhook:metadata_notify endpoint=\"$${METADATA_WEBHOOK_URL}\"; + echo 'Webhooks setup successfully'; + + # Restart MinIO to apply webhook configurations + mc admin service restart myminio --quiet --json; + echo 'MinIO Service Restarted' + # Wait for MinIO to come back up + sleep 10; + + # Configure event notifications + mc event add myminio/document arn:minio:sqs::document_notify:webhook --event put,delete; + mc event add myminio/warehouse --prefix 
metadata --suffix .msgpack arn:minio:sqs::metadata_notify:webhook --event put,delete; + + echo 'MinIO setup completed successfully!'; + standalone: container_name: milvus-standalone image: milvusdb/milvus:v2.4.9 @@ -64,6 +107,24 @@ services: - "etcd" - "minio" + dataprep-milvus-service: + image: opea/dataprep-minio-milvus:0.1 + container_name: dataprep-milvus-server + depends_on: + - standalone + ports: + - "6010:6010" + volumes: + - "./prepare_doc_milvus.py:/home/user/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py" + - "../../minio_schema.py:/home/user/comps/dataprep/minio/minio_schema.py" + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MILVUS_HOST: http://standalone + MILVUS_PORT: 19530 + MINIO_ENDPOINT: minio:9000 + COLLECTION_NAME: ${INDEX_NAME:-milvus_db} + networks: default: name: milvus diff --git a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py index d6aba4c978..76ceb5a2af 100644 --- a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py @@ -8,23 +8,27 @@ from pathlib import Path from typing import List, Optional, Union -from minio import Minio +from minio import Minio, S3Error from comps.dataprep.minio.milvus.langchain.config import MINIO_WAREHOUSE_BUCKET from comps.dataprep.minio.minio_schema import MinioEventNotification from config import ( COLLECTION_NAME, + LOCAL_EMBEDDING_MODEL, MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, MINIO_SECURE, MILVUS_HOST, MILVUS_PORT, - MINIO_DOCUMENT_BUCKET + MINIO_DOCUMENT_BUCKET, + MOSEC_EMBEDDING_ENDPOINT, + MOSEC_EMBEDDING_MODEL, + TEI_EMBEDDING_ENDPOINT, ) from fastapi import Body, File, Form, HTTPException, UploadFile, Request from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, 
HuggingFaceHubEmbeddings, OpenAIEmbeddings from langchain_core.documents import Document from langchain_milvus.vectorstores import Milvus from langchain_text_splitters import HTMLHeaderTextSplitter @@ -101,11 +105,12 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): batch_docs = insert_docs[i: i + batch_size] try: + logger.info(f"MILVUS HOST IS: {MILVUS_HOST}") _ = Milvus.from_documents( batch_docs, embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, partition_key_field=partition_field_name, ) except Exception as e: @@ -182,13 +187,27 @@ def search_all(collection): return results -def delete_all_data(my_milvus): +def delete_all_data(): if logflag: logger.info("[ delete all ] deleting all data in milvus") - if my_milvus.col: - my_milvus.col.drop() - if logflag: - logger.info("[ delete all ] delete success: all data") + # List and delete all objects + try: + # Generate a list of all objects in the bucket + objects = minio_client.list_objects(MINIO_DOCUMENT_BUCKET, recursive=True) + + # Delete each object + for obj in objects: + minio_client.remove_object(MINIO_DOCUMENT_BUCKET, obj.object_name) + print(f"Deleted {obj.object_name}") + + print("All objects have been deleted from the bucket.") + + except S3Error as e: + print("Error:", e) + # if my_milvus.col: + # my_milvus.col.drop() + # if logflag: + # logger.info("[ delete all ] delete success: all data") def delete_by_partition_field(my_milvus, partition_field): @@ -263,6 +282,7 @@ async def ingest_documents( for link in link_list: encoded_link = encode_filename(link) + if logflag: logger.info(f"[ upload ] processing link {encoded_link}") @@ -322,6 +342,11 @@ async def process_documents(event: MinioEventNotification): length=buffer_size, content_type='application/x-msgpack' ) + if event.EventName == "s3:ObjectRemoved:Delete": + for record in event.Records: + object_name = record.s3.object.key 
+ minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, + object_name=f"metadata/{object_name}.msgpack") return {"status": 200, "message": "Document processed successfully"} @@ -329,8 +354,43 @@ async def process_documents(event: MinioEventNotification): name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/minio/metadata/notification", host="0.0.0.0", port=6010 ) async def process_metadata(event: MinioEventNotification): - print(event) - pass + # json_data = await request.json() + # print(json.dumps(json_data, indent=2)) + if event.EventName == "s3:ObjectCreated:Put": + for record in event.Records: + bucket_name = record.s3.bucket.name + object_name = record.s3.object.key + response = minio_client.get_object(bucket_name, object_name) + msgpack_data = response.read() + response.close() + response.release_conn() + + # Deserialize the MsgPack data back into a list + chunk_list = msgpack.unpackb(msgpack_data) + print(f"Total Chunks are {len(chunk_list)}") + file_name = object_name.split(".msgpack")[0].split("metadata/")[1] + ingest_chunks_to_milvus(file_name, chunk_list) + elif event.EventName == "s3:ObjectRemoved:Delete": + # define Milvus obj + my_milvus = Milvus( + embedding_function=embeddings, + collection_name=COLLECTION_NAME, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, + index_params=index_params, + auto_id=True, + ) + for record in event.Records: + object_name = record.s3.object.key + file_name = object_name.split(".msgpack")[0].split("metadata/")[1] + encode_file_name = encode_filename(file_name) + try: + delete_by_partition_field(my_milvus, encode_file_name) + except Exception as e: + if logflag: + logger.info(f"[delete] fail to delete file {file_name}: {e}") + return {"status": False} + + return {"status": 200, "message": "Metadata processed successfully"} @register_microservice( @@ -344,7 +404,7 @@ async def rag_get_file_structure(): my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - 
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, index_params=index_params, auto_id=True, ) @@ -398,94 +458,59 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): if logflag: logger.info(file_path) - # define Milvus obj - my_milvus = Milvus( - embedding_function=embeddings, - collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, - index_params=index_params, - auto_id=True, - ) - # delete all uploaded files if file_path == "all": if logflag: logger.info("[ delete ] deleting all files") - delete_all_data(my_milvus) - # delete files on local disk - try: - remove_folder_with_ignore(upload_folder) - except Exception as e: - if logflag: - logger.info(f"[ delete ] {e}. Fail to delete {upload_folder}.") - raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.") + delete_all_data() if logflag: logger.info("[ delete ] successfully delete all files.") - create_upload_folder(upload_folder) - if logflag: - logger.info("[ delete ] new upload folder created.") return {"status": True} encode_file_name = encode_filename(file_path) - delete_path = Path(upload_folder + "/" + encode_file_name) - if logflag: - logger.info(f"[delete] delete_path: {delete_path}") - - # partially delete files - if delete_path.exists(): - # TODO: check existence before delete - - # delete file - if delete_path.is_file(): - if logflag: - logger.info(f"[delete] deleting file {encode_file_name}") - try: - delete_by_partition_field(my_milvus, encode_file_name) - except Exception as e: - if logflag: - logger.info(f"[delete] fail to delete file {delete_path}: {e}") - return {"status": False} - delete_path.unlink() - if logflag: - logger.info(f"[delete] file {file_path} deleted") - return {"status": True} - - # delete folder - else: - if logflag: - logger.info(f"[delete] delete folder {file_path} is not supported for now.") - raise 
HTTPException(status_code=404, detail=f"Delete folder {file_path} is not supported for now.") - else: - raise HTTPException(status_code=404, detail="File/folder not found. Please check del_path.") + if logflag: + logger.info(f"[delete] deleting file {encode_file_name}") + try: + minio_client.remove_object(MINIO_DOCUMENT_BUCKET, encode_file_name) + except Exception as e: + if logflag: + logger.info(f"[delete] fail to delete file {encode_file_name}: {e}") + return {"status": False} + if logflag: + logger.info(f"[delete] file {file_path} deleted") + return {"status": True} if __name__ == "__main__": create_upload_folder(upload_folder) print(f"upload folder {upload_folder} created at {Path(upload_folder).absolute()}") - # # Create vectorstore - # if MOSEC_EMBEDDING_ENDPOINT: - # # create embeddings using MOSEC endpoint service - # if logflag: - # logger.info( - # f"[ prepare_doc_minio_milvus ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" - # ) - # embeddings = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) - # elif TEI_EMBEDDING_ENDPOINT: - # # create embeddings using TEI endpoint service - # if logflag: - # logger.info(f"[ prepare_doc_minio_milvus ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") - # embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) - # else: - # # create embeddings using local embedding model - # if logflag: - # logger.info(f"[ prepare_doc_minio_milvus ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") - # embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL) + # Create vectorstore + if MOSEC_EMBEDDING_ENDPOINT: + # create embeddings using MOSEC endpoint service + if logflag: + logger.info( + f"[ prepare_doc_minio_milvus ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + ) + embeddings = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) + elif TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service 
+ if logflag: + logger.info(f"[ prepare_doc_minio_milvus ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) + else: + # create embeddings using local embedding model + if logflag: + logger.info(f"[ prepare_doc_minio_milvus ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") + embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs = { + 'device': 'cpu', + 'trust_remote_code':True + }) opea_microservices["opea_service@prepare_doc_minio_milvus"].start() print("DOCPREP Server Started") diff --git a/comps/dataprep/minio/milvus/langchain/requirements.txt b/comps/dataprep/minio/milvus/langchain/requirements.txt index 5bae5d2525..03462bcf0b 100644 --- a/comps/dataprep/minio/milvus/langchain/requirements.txt +++ b/comps/dataprep/minio/milvus/langchain/requirements.txt @@ -10,6 +10,7 @@ langchain-community langchain-text-splitters langchain_milvus markdown +minio msgpack numpy openai diff --git a/comps/dataprep/minio/minio_schema.py b/comps/dataprep/minio/minio_schema.py index 97f0c632cc..4fdee03ba1 100644 --- a/comps/dataprep/minio/minio_schema.py +++ b/comps/dataprep/minio/minio_schema.py @@ -1,4 +1,6 @@ -from pydantic import BaseModel, Field +from urllib.parse import unquote + +from pydantic import BaseModel, Field, validator from typing import List, Optional from datetime import datetime @@ -37,12 +39,17 @@ class Config: class S3Object(BaseModel): key: str - size: int - eTag: str - contentType: str - userMetadata: ObjectUserMetadata + size: Optional[int] = None + eTag: Optional[str] = None + contentType: Optional[str] = None + userMetadata: Optional[ObjectUserMetadata] = None sequencer: str + @validator('key') + def decode_key(cls, v): + """Decode URL-encoded key""" + return unquote(v) + class S3(BaseModel): s3SchemaVersion: str configurationId: str From 2d1a56185d4c47c0ac7b0016e0d64fd47b2ea7a9 Mon Sep 17 00:00:00 2001 From: dil Date: Sat, 2 Nov 2024 22:42:31 +0000 Subject: 
[PATCH 03/28] Update to latest MinIO Image --- comps/dataprep/minio/milvus/langchain/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/milvus/langchain/docker-compose.yml b/comps/dataprep/minio/milvus/langchain/docker-compose.yml index 6c50fe491d..4a756a0444 100644 --- a/comps/dataprep/minio/milvus/langchain/docker-compose.yml +++ b/comps/dataprep/minio/milvus/langchain/docker-compose.yml @@ -23,7 +23,7 @@ services: minio: container_name: milvus-minio - image: minio/minio:RELEASE.2023-03-20T20-16-18Z + image: minio/minio:latest environment: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin From 92e99c63704cdb7938cf4ef9adb7c8e6ca446e1d Mon Sep 17 00:00:00 2001 From: dil Date: Sat, 2 Nov 2024 22:43:34 +0000 Subject: [PATCH 04/28] Add LnaceDB and MinIO event based dataprep support --- .../minio/lancedb/langchain/Dockerfile | 38 ++ .../minio/lancedb/langchain/README.md | 252 +++++++++ .../minio/lancedb/langchain/__init__.py | 2 + .../minio/lancedb/langchain/config.py | 27 + .../lancedb/langchain/docker-compose.yml | 87 +++ .../lancedb/langchain/prepare_doc_lancedb.py | 500 ++++++++++++++++++ .../minio/lancedb/langchain/requirements.txt | 33 ++ 7 files changed, 939 insertions(+) create mode 100644 comps/dataprep/minio/lancedb/langchain/Dockerfile create mode 100644 comps/dataprep/minio/lancedb/langchain/README.md create mode 100644 comps/dataprep/minio/lancedb/langchain/__init__.py create mode 100644 comps/dataprep/minio/lancedb/langchain/config.py create mode 100644 comps/dataprep/minio/lancedb/langchain/docker-compose.yml create mode 100644 comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py create mode 100644 comps/dataprep/minio/lancedb/langchain/requirements.txt diff --git a/comps/dataprep/minio/lancedb/langchain/Dockerfile b/comps/dataprep/minio/lancedb/langchain/Dockerfile new file mode 100644 index 0000000000..68ab124ceb --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/Dockerfile 
@@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + tesseract-ocr + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/minio/lancedb/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/minio/lancedb/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/minio/lancedb/langchain/uploaded_files + +USER user +WORKDIR /home/user/comps/dataprep/minio/lancedb/langchain + +ENTRYPOINT ["python", "prepare_doc_lancedb.py"] diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md new file mode 100644 index 0000000000..ce7260e879 --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -0,0 +1,252 @@ +# Dataprep Microservice with Milvus + +## 🚀1. Start Microservice with Python (Option 1) + +### 1.1 Requirements + +```bash +pip install -r requirements.txt +apt-get install tesseract-ocr -y +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### 1.2 Start Milvus Server + +Please refer to this [readme](../../../vectorstores/milvus/README.md). 
+ +### 1.3 Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export MILVUS_HOST=${your_milvus_host_ip} +export MILVUS_PORT=19530 +export COLLECTION_NAME=${your_collection_name} +export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} +``` + +### 1.4 Start Mosec Embedding Service + +First, you need to build a mosec embedding serving docker image. + +```bash +cd ../../.. +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . +``` + +Then start the mosec embedding server. + +```bash +your_port=6010 +docker run -d --name="embedding-mosec-endpoint" -p $your_port:8000 opea/embedding-mosec-endpoint:latest +``` + +Setup environment variables: + +```bash +export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" +export MILVUS_HOST=${your_host_ip} +``` + +### 1.5 Start Document Preparation Microservice for Milvus with Python Script + +Start document preparation microservice for Milvus with below command. + +```bash +python prepare_doc_milvus.py +``` + +## 🚀2. Start Microservice with Docker (Option 2) + +### 2.1 Start Milvus Server + +Please refer to this [readme](../../../vectorstores/milvus/README.md). + +### 2.2 Build Docker Image + +```bash +cd ../../.. +# build mosec embedding docker image +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . +# build dataprep milvus docker image +docker build -t opea/dataprep-minio-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/milvus/langchain/Dockerfile . 
+``` + +### 2.3 Setup Environment Variables + +```bash +export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" +export MILVUS_HOST=${your_host_ip} +``` + +### 2.3 Run Docker with CLI (Option A) + +```bash +docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} opea/dataprep-milvus:latest +``` + +### 2.4 Run with Docker Compose (Option B) + +```bash +mkdir model +cd model +git clone https://huggingface.co/BAAI/bge-base-en-v1.5 +cd ../ +# Update `host_ip` and `HUGGINGFACEHUB_API_TOKEN` in set_env.sh +. set_env.sh +docker compose -f docker-compose-dataprep-milvus.yaml up -d +``` + +## 🚀3. Consume Microservice + +### 3.1 Consume Upload API + +Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. + +Make sure the file path after `files=@` is correct. + +- Single file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + http://localhost:6010/v1/dataprep +``` + +You can specify chunk_size and chunk_overlap by the following commands. To avoid big chunks, pass a small chunk_size like 500 as below (default 1500). 
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + -F "chunk_size=500" \ + -F "chunk_overlap=100" \ + http://localhost:6010/v1/dataprep +``` + +- Multiple file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.pdf" \ + -F "files=@./file2.pdf" \ + -F "files=@./file3.pdf" \ + http://localhost:6010/v1/dataprep +``` + +- Links upload (not supported for llama_index now) + +```bash +curl -X POST \ + -F 'link_list=["https://www.ces.tech/"]' \ + http://localhost:6010/v1/dataprep +``` + +or + +```python +import requests +import json + +proxies = {"http": ""} +url = "http://localhost:6010/v1/dataprep" +urls = [ + "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" +] +payload = {"link_list": json.dumps(urls)} + +try: + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") +except requests.exceptions.RequestException as e: + print("An error occurred:", e) +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. + +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep +``` + +### 3.2 Consume get_file API + +To get uploaded file structures, use the following command: + +```bash +curl -X POST \ + -H "Content-Type: application/json" \ + http://localhost:6010/v1/dataprep/get_file +``` + +Then you will get the response JSON like this: + +```json +[ + { + "name": "uploaded_file_1.txt", + "id": "uploaded_file_1.txt", + "type": "File", + "parent": "" + }, + { + "name": "uploaded_file_2.txt", + "id": "uploaded_file_2.txt", + "type": "File", + "parent": "" + } +] +``` + +### 3.3 Consume delete_file API + +To delete uploaded file/link, use the following command. + +The `file_path` here should be the `id` returned by the `/v1/dataprep/get_file` API. 
+ +```bash +# delete link +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "https://www.ces.tech/.txt"}' \ + http://localhost:6010/v1/dataprep/delete_file + +# delete file +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "uploaded_file_1.txt"}' \ + http://localhost:6010/v1/dataprep/delete_file + +# delete all files and links, will drop the entire db collection +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ + http://localhost:6010/v1/dataprep/delete_file +``` + +## 🚀4. Troubleshooting + +1. If you get errors from Mosec Embedding Endpoint like `cannot find this task, maybe it has expired` while uploading files, try to reduce the `chunk_size` in the curl command like below (the default chunk_size=1500). + + ```bash + curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + -F "chunk_size=500" \ + http://localhost:6010/v1/dataprep + ``` diff --git a/comps/dataprep/minio/lancedb/langchain/__init__.py b/comps/dataprep/minio/lancedb/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/lancedb/langchain/config.py b/comps/dataprep/minio/lancedb/langchain/config.py new file mode 100644 index 0000000000..85f53a2661 --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/config.py @@ -0,0 +1,27 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Local Embedding model +LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1") +# TEI Embedding endpoints +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +# LanceDB configuration +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus") +# MOSEC configuration +MOSEC_EMBEDDING_MODEL = 
os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5") +MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "") +MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") +MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == 'true' +MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") +MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") +os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT +os.environ["OPENAI_API_KEY"] = "Dummy key" +os.environ["AWS_ENDPOINT"] = f"http://{MINIO_ENDPOINT}" +os.environ["AWS_ACCESS_KEY_ID"] = MINIO_ACCESS_KEY +os.environ["AWS_SECRET_ACCESS_KEY"] = MINIO_SECRET_KEY +os.environ["ALLOW_HTTP"] = str(MINIO_SECURE != "true").lower() + diff --git a/comps/dataprep/minio/lancedb/langchain/docker-compose.yml b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml new file mode 100644 index 0000000000..bba6cd6cbd --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml @@ -0,0 +1,87 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: '3.5' + +services: + minio: + container_name: minio + image: minio/minio:latest + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "5044:9001" + - "5043:9000" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + minio-setup: + container_name: minio-mc + image: minio/mc + depends_on: + minio: + condition: service_healthy + dataprep-lancedb-service: + condition: service_started + + environment: + MINIO_URL: ${MINIO_URL:-http://minio:9000} + MINIO_ROOT_USER: 
${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + DOCUMENT_WEBHOOK_URL: ${DOCUMENT_WEBHOOK_URL:-http://dataprep-lancedb-service:6010/v1/minio/document/notification} + METADATA_WEBHOOK_URL: ${METADATA_WEBHOOK_URL:-http://dataprep-lancedb-service:6010/v1/minio/metadata/notification} + entrypoint: + - /bin/sh + - -c + - | + set -x; + mc alias set myminio $${MINIO_URL} $${MINIO_ROOT_USER} $${MINIO_ROOT_PASSWORD}; + + # Create buckets + mc mb --ignore-existing myminio/document; + mc mb --ignore-existing myminio/warehouse; + echo 'Created Buckets'; + # Configure webhooks + mc admin config set myminio notify_webhook:document_notify endpoint=\"$${DOCUMENT_WEBHOOK_URL}\"; + mc admin config set myminio notify_webhook:metadata_notify endpoint=\"$${METADATA_WEBHOOK_URL}\"; + echo 'Webhooks setup successfully'; + + # Restart MinIO to apply webhook configurations + mc admin service restart myminio --quiet --json; + echo 'MinIO Service Restarted' + # Wait for MinIO to come back up + sleep 10; + + # Configure event notifications + mc event add myminio/document arn:minio:sqs::document_notify:webhook --event put,delete; + mc event add myminio/warehouse --prefix metadata --suffix .msgpack arn:minio:sqs::metadata_notify:webhook --event put,delete; + + echo 'MinIO setup completed successfully!'; + + dataprep-lancedb-service: + image: opea/dataprep-minio-lancedb:0.1 + container_name: dataprep-lancedb-server + depends_on: + - minio + ports: + - "6010:6010" + volumes: + - "./prepare_doc_lancedb.py:/home/user/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py" + - "./config.py:/home/user/comps/dataprep/minio/lancedb/langchain/config.py" + - "../../minio_schema.py:/home/user/comps/dataprep/minio/minio_schema.py" + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MINIO_ENDPOINT: minio:9000 + COLLECTION_NAME: ${INDEX_NAME:-lance_db} + +networks: + default: + name: lancedb diff --git 
a/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py new file mode 100644 index 0000000000..1bef45b4bd --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py @@ -0,0 +1,500 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import io +import json + +import lancedb +import msgpack +import os +import tempfile +from pathlib import Path +from typing import List, Optional, Union + +from minio import Minio, S3Error + +from comps.dataprep.minio.minio_schema import MinioEventNotification +from config import ( + COLLECTION_NAME, + LOCAL_EMBEDDING_MODEL, + MINIO_ENDPOINT, + MINIO_ACCESS_KEY, + MINIO_SECRET_KEY, + MINIO_SECURE, + MINIO_DOCUMENT_BUCKET, + MOSEC_EMBEDDING_ENDPOINT, + MOSEC_EMBEDDING_MODEL, + TEI_EMBEDDING_ENDPOINT, + MINIO_WAREHOUSE_BUCKET +) +from fastapi import Body, File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings +from langchain_community.vectorstores import LanceDB +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + decode_filename, + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, +) + +logger = CustomLogger("prepare_doc_minio_lancedb") +logflag = os.getenv("LOGFLAG", True) + +# workaround notes: cp comps/dataprep/utils.py ./lancedb/utils.py +# from utils import document_loader, get_tables_result, parse_html +index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}} +partition_field_name = "filename" + +minio_client = Minio( + endpoint=MINIO_ENDPOINT, + access_key=MINIO_ACCESS_KEY, + secret_key=MINIO_SECRET_KEY, + secure=MINIO_SECURE) + + +class 
MosecEmbeddings(OpenAIEmbeddings): + def _get_len_safe_embeddings( + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + ) -> List[List[float]]: + _chunk_size = chunk_size or self.chunk_size + batched_embeddings: List[List[float]] = [] + response = self.client.create(input=texts, **self._invocation_params) + if not isinstance(response, dict): + response = response.model_dump() + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + _cached_empty_embedding: Optional[List[float]] = None + + def empty_embedding() -> List[float]: + nonlocal _cached_empty_embedding + if _cached_empty_embedding is None: + average_embedded = self.client.create(input="", **self._invocation_params) + if not isinstance(average_embedded, dict): + average_embedded = average_embedded.model_dump() + _cached_empty_embedding = average_embedded["data"][0]["embedding"] + return _cached_empty_embedding + + return [e if e is not None else empty_embedding() for e in batched_embeddings] + + +def ingest_chunks_to_lancedb(file_name: str, chunks: List): + if logflag: + logger.info(f"[ ingest chunks ] file name: {file_name}") + + tbl = my_lancedb.get_table() + + # insert documents to lancedb + insert_text = [] + insert_metadata = [] + doc_ids = [] + for i, chunk in enumerate(chunks): + insert_text.append(chunk) + insert_metadata.append({partition_field_name: file_name}) + doc_ids.append(f"{file_name}_{i}") + # Batch size + batch_size = 32 + num_chunks = len(chunks) + + for i in range(0, num_chunks, batch_size): + if logflag: + logger.info(f"[ ingest chunks ] Current batch: {i}") + batch_texts = insert_text[i: i + batch_size] + batch_metadata = insert_metadata[i: i + batch_size] + batch_embeddings = embeddings.embed_documents(batch_texts) + batch_doc_ids = doc_ids[i: i + batch_size] + + data_docs = [] + for j, doc in enumerate(batch_texts): + data_docs.append({ + "text": doc, + "metadata": batch_metadata[j], + "vector": batch_embeddings[j], + "id": batch_doc_ids[j] + 
}) + + try: + + if tbl is None: + tbl = lancedb.connect(f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db").create_table(COLLECTION_NAME, data_docs) + else: + tbl.add(data_docs) + except Exception as e: + if logflag: + logger.info(f"[ ingest chunks ] fail to ingest chunks into lancedb. error: {e}") + raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.") + + if logflag: + logger.info(f"[ ingest chunks ] Docs ingested file {file_name} to lancedb collection {COLLECTION_NAME}.") + + return True + + +def ingest_data_to_minio(doc_path: DocPath): + """Load the document at doc_path.path and return its chunks (structured files are returned as loaded).""" + path = doc_path.path + file_name = path.split("/")[-1] + if logflag: + logger.info(f"[ ingest data ] Parsing document {path}, file name: {file_name}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + if logflag: + logger.info("[ ingest data ] file content loaded") + + structured_types = [".xlsx", ".csv", ".json", ".jsonl"] # entries must include the dot to match os.path.splitext + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info(f"[ ingest data ] Done preprocessing.
Created {len(chunks)} chunks of the original file.") + + return chunks + + +def search_by_file(collection, file_name): + query = f"{partition_field_name} == '{file_name}'" + results = collection.query( + expr=query, + output_fields=[partition_field_name, "pk"], + ) + if logflag: + logger.info(f"[ search by file ] searched by {file_name}") + logger.info(f"[ search by file ] {len(results)} results: {results}") + return results + + +def search_all(collection): + results = collection.search(query="pk >= 0", output_fields=[partition_field_name, "pk"]) + if logflag: + logger.info(f"[ search all ] {len(results)} results: {results}") + return results + + +def delete_all_data(): + if logflag: + logger.info("[ delete all ] deleting all data in lancedb") + # List and delete all objects + try: + # Generate a list of all objects in the bucket + objects = minio_client.list_objects(MINIO_DOCUMENT_BUCKET, recursive=True) + + # Delete each object + for obj in objects: + minio_client.remove_object(MINIO_DOCUMENT_BUCKET, obj.object_name) + print(f"Deleted {obj.object_name}") + + print("All objects have been deleted from the bucket.") + + except S3Error as e: + print("Error:", e) + + +def delete_by_partition_field(my_lancedb, partition_field): + if logflag: + logger.info(f"[ delete partition ] deleting {partition_field_name} {partition_field}") + res = my_lancedb.delete(filter=f"metadata.{partition_field_name} == '{partition_field}'") + if logflag: + logger.info(f"[ delete partition ] delete success: {res}") + + +@register_microservice(name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/dataprep", host="0.0.0.0", + port=6010) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + 
logger.info(f"[ upload ] link_list:{link_list}") + + if files and link_list: + raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + + for file in files: + encode_file = encode_filename(file.filename) + save_path = f"s3://{MINIO_DOCUMENT_BUCKET}/{encode_file}" + if logflag: + logger.info(f"[ upload ] processing file {save_path}") + + content = await file.read() + file_size = len(content) + file_data = io.BytesIO(content) + + minio_client.put_object( + bucket_name=MINIO_DOCUMENT_BUCKET, + object_name=encode_file, + data=file_data, + length=file_size, + content_type=file.content_type, + metadata={ + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "process_table": process_table, + "table_strategy": table_strategy + }) + + uploaded_files.append(save_path) + if logflag: + logger.info(f"Saved file {save_path} into MinIO") + + results = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(results) + return results + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + + for link in link_list: + encoded_link = encode_filename(link) + + if logflag: + logger.info(f"[ upload ] processing link {encoded_link}") + + encode_file = f"{encoded_link}.txt" + content = parse_html([link])[0][0].encode("utf-8") # BytesIO needs bytes, and length must be the byte count + file_size = len(content) + file_data = io.BytesIO(content) + + minio_client.put_object( + bucket_name=MINIO_DOCUMENT_BUCKET, + object_name=encode_file, + data=file_data, + length=file_size, + metadata={ + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "process_table": process_table, + "table_strategy": table_strategy + }) + + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation
succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +@register_microservice( + name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/minio/document/notification", host="0.0.0.0", port=6010 +) +async def process_documents(event: MinioEventNotification): + # json_data = await request.json() + # print(json.dumps(json_data, indent=2)) + print(event) + if event.EventName == "s3:ObjectCreated:Put": + for record in event.Records: + bucket_name = record.s3.bucket.name + object_name = record.s3.object.key + _, file_extension = os.path.splitext(object_name) + with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_file: + temp_file_path = temp_file.name + minio_client.fget_object(bucket_name, object_name, temp_file_path) + chunks = ingest_data_to_minio(DocPath( + path=temp_file_path, + chunk_size=record.s3.object.userMetadata.chunk_size, + chunk_overlap=record.s3.object.userMetadata.chunk_overlap, + process_table=record.s3.object.userMetadata.process_table, + table_strategy=record.s3.object.userMetadata.table_strategy, + )) + msgpack_data = msgpack.packb(chunks) + buffer = io.BytesIO(msgpack_data) + buffer_size = buffer.getbuffer().nbytes + minio_client.put_object( + MINIO_WAREHOUSE_BUCKET, + object_name=f"metadata/{object_name}.msgpack", + data=buffer, + length=buffer_size, + content_type='application/x-msgpack' + ) + if event.EventName == "s3:ObjectRemoved:Delete": + for record in event.Records: + object_name = record.s3.object.key + minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, + object_name=f"metadata/{object_name}.msgpack") + return {"status": 200, "message": "Document processed successfully"} + + +@register_microservice( + name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/minio/metadata/notification", host="0.0.0.0", port=6010 +) +async def process_metadata(event: MinioEventNotification): + # json_data = await request.json() + # print(json.dumps(json_data, indent=2)) 
+ if event.EventName == "s3:ObjectCreated:Put": + for record in event.Records: + bucket_name = record.s3.bucket.name + object_name = record.s3.object.key + response = minio_client.get_object(bucket_name, object_name) + msgpack_data = response.read() + response.close() + response.release_conn() + + # Deserialize the MsgPack data back into a list + chunk_list = msgpack.unpackb(msgpack_data) + print(f"Total Chunks are {len(chunk_list)}") + file_name = object_name.split(".msgpack")[0].split("metadata/")[1] + ingest_chunks_to_lancedb(file_name, chunk_list) + elif event.EventName == "s3:ObjectRemoved:Delete": + # define lancedb obj + for record in event.Records: + object_name = record.s3.object.key + file_name = object_name.split(".msgpack")[0].split("metadata/")[1] + # use the same key-derived name that the Put branch stored; re-encoding it would not match + try: + delete_by_partition_field(my_lancedb, file_name) + except Exception as e: + if logflag: + logger.info(f"[delete] fail to delete file {file_name}: {e}") + return {"status": False} + + return {"status": 200, "message": "Metadata processed successfully"} + + +@register_microservice( + name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6010 +) +async def rag_get_file_structure(): + if logflag: + logger.info("[ get ] start to get file structure") + + # collection does not exist + if not my_lancedb: + logger.info(f"[ get ] collection {COLLECTION_NAME} does not exist.") + return [] + + # get all files from db + try: + file_objects = minio_client.list_objects(MINIO_DOCUMENT_BUCKET) + except Exception as e: + raise HTTPException(status_code=500, detail="Failed when searching in lancedb db for all files.") + + res_file = [res.object_name for res in file_objects] + if logflag: + logger.info(f"[ get ] unique list from db: {res_file}") + + # construct result file list in format + file_list = [] + for file_name in res_file: + file_dict = { + "name": decode_filename(file_name), + "id":
decode_filename(file_name), + "type": "File", + "parent": "", + } + file_list.append(file_dict) + + if logflag: + logger.info(f"[ get ] final file list: {file_list}") + return file_list + + +@register_microservice( + name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6010 +) +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - file/link path (e.g. /path/to/file.txt) + - "all": delete all files uploaded + """ + if logflag: + logger.info(file_path) + + # delete all uploaded files + if file_path == "all": + if logflag: + logger.info("[ delete ] deleting all files") + + delete_all_data() + + if logflag: + logger.info("[ delete ] successfully delete all files.") + + return {"status": True} + + encode_file_name = encode_filename(file_path) + + if logflag: + logger.info(f"[delete] deleting file {encode_file_name}") + try: + minio_client.remove_object(MINIO_DOCUMENT_BUCKET, encode_file_name) + except Exception as e: + if logflag: + logger.info(f"[delete] fail to delete file {encode_file_name}: {e}") + return {"status": False} + if logflag: + logger.info(f"[delete] file {file_path} deleted") + return {"status": True} + + +if __name__ == "__main__": + logger.info("[ prepare_doc_minio_lancedb ] Using MinIO as the object storage.") + # Create vectorstore + if MOSEC_EMBEDDING_ENDPOINT: + # create embeddings using MOSEC endpoint service + if logflag: + logger.info( + f"[ prepare_doc_minio_lancedb ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + ) + embeddings = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) + elif TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + if logflag: + logger.info(f"[ prepare_doc_minio_lancedb ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) + else: + # create embeddings 
using local embedding model + if logflag: + logger.info(f"[ prepare_doc_minio_lancedb ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") + embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={ + 'device': 'cpu', + 'trust_remote_code': True + }) + # create lancedb + my_lancedb = LanceDB( + uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", + embedding=embeddings, + table_name=COLLECTION_NAME) + + opea_microservices["opea_service@prepare_doc_minio_lancedb"].start() + print("DOCPREP Server Started") diff --git a/comps/dataprep/minio/lancedb/langchain/requirements.txt b/comps/dataprep/minio/lancedb/langchain/requirements.txt new file mode 100644 index 0000000000..6c5163dfb7 --- /dev/null +++ b/comps/dataprep/minio/lancedb/langchain/requirements.txt @@ -0,0 +1,33 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +lancedb +langchain +langchain-community +langchain-text-splitters +markdown +msgpack +minio +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +pytesseract +python-docx +python-pptx +sentence_transformers +s3fs +shortuuid +tiktoken +unstructured[all-docs]==0.15.7 +uvicorn From d6a5a7e51ca11d45f245df82736bf507512836d8 Mon Sep 17 00:00:00 2001 From: dil Date: Sat, 2 Nov 2024 22:44:27 +0000 Subject: [PATCH 05/28] Add MinIO utils file that parses event notifications from MinIO --- comps/dataprep/minio_utils.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 comps/dataprep/minio_utils.py diff --git a/comps/dataprep/minio_utils.py b/comps/dataprep/minio_utils.py new file mode 100644 index 0000000000..e69de29bb2 From 2b13f5d6b821a35f4844600b56d97c4a39ff2ae9 Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 06:00:07 +0000 Subject: [PATCH 06/28] Update README.md files --- comps/dataprep/milvus/langchain/README.md | 36 ++++++----- .../minio/lancedb/langchain/README.md | 60 
+++++++++---------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/comps/dataprep/milvus/langchain/README.md b/comps/dataprep/milvus/langchain/README.md index f349df54cb..1c0ee9f606 100644 --- a/comps/dataprep/milvus/langchain/README.md +++ b/comps/dataprep/milvus/langchain/README.md @@ -1,4 +1,4 @@ -# Dataprep Microservice with Milvus +# Dataprep Microservice with MinIO and Milvus ## 🚀1. Start Microservice with Python (Option 1) @@ -23,6 +23,10 @@ export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} export MILVUS_HOST=${your_milvus_host_ip} export MILVUS_PORT=19530 +export MINIO_ACCESS_KEY=${your_minio_access_key} +export MINIO_SECRET_KEY=${your_minio_secret_key} +export MINIO_ENDPOINT=${your_minio_endpoint} +export MINIO_SECURE=${your_minio_secure} export COLLECTION_NAME=${your_collection_name} export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} ``` @@ -71,7 +75,7 @@ cd ../../.. # build mosec embedding docker image docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . # build dataprep milvus docker image -docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/milvus/langchain/Dockerfile . +docker build -t opea/dataprep-minio-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/milvus/langchain/Dockerfile .
``` ### 2.3 Setup Environment Variables @@ -79,24 +83,26 @@ docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy ```bash export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" export MILVUS_HOST=${your_host_ip} +export MINIO_ACCESS_KEY=${your_minio_access_key} +export MINIO_SECRET_KEY=${your_minio_secret_key} +export MINIO_ENDPOINT=${your_minio_endpoint} +export MINIO_SECURE=${your_minio_secure} ``` ### 2.3 Run Docker with CLI (Option A) ```bash -docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} opea/dataprep-milvus:latest -``` - -### 2.4 Run with Docker Compose (Option B) - -```bash -mkdir model -cd model -git clone https://huggingface.co/BAAI/bge-base-en-v1.5 -cd ../ -# Update `host_ip` and `HUGGINGFACEHUB_API_TOKEN` in set_env.sh -. set_env.sh -docker compose -f docker-compose-dataprep-milvus.yaml up -d +docker run -d --name="dataprep-minio-milvus-server" -p 6010:6010 --ipc=host \ +-e http_proxy=$http_proxy \ +-e https_proxy=$https_proxy \ +-e no_proxy=$no_proxy \ +-e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} \ +-e MILVUS_HOST=${MILVUS_HOST} \ +-e MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} \ +-e MINIO_SECRET_KEY=${MINIO_SECRET_KEY} \ +-e MINIO_ENDPOINT=${MINIO_ENDPOINT} \ +-e MINIO_SECURE=${MINIO_SECURE} \ +opea/dataprep-minio-milvus:latest ``` ## 🚀3. Consume Microservice diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index ce7260e879..7e8ce66c99 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -1,4 +1,4 @@ -# Dataprep Microservice with Milvus +# Dataprep Microservice with MinIO and Lancedb ## 🚀1.
Start Microservice with Python (Option 1) @@ -11,23 +11,21 @@ apt-get install libtesseract-dev -y apt-get install poppler-utils -y ``` -### 1.2 Start Milvus Server - -Please refer to this [readme](../../../vectorstores/milvus/README.md). - -### 1.3 Setup Environment Variables +### 1.2 Setup Environment Variables ```bash export no_proxy=${your_no_proxy} export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} -export MILVUS_HOST=${your_milvus_host_ip} -export MILVUS_PORT=19530 +export MINIO_ACCESS_KEY=${your_minio_access_key} +export MINIO_SECRET_KEY=${your_minio_secret_key} +export MINIO_ENDPOINT=${your_minio_endpoint} +export MINIO_SECURE=${your_minio_secure} export COLLECTION_NAME=${your_collection_name} export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} ``` -### 1.4 Start Mosec Embedding Service +### 1.3 Start Mosec Embedding Service First, you need to build a mosec embedding serving docker image. @@ -47,22 +45,21 @@ Setup environment variables: ```bash export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" -export MILVUS_HOST=${your_host_ip} ``` -### 1.5 Start Document Preparation Microservice for Milvus with Python Script +### 1.4 Start Document Preparation Microservice for Lancedb with Python Script -Start document preparation microservice for Milvus with below command. +Start document preparation microservice for Lancedb with below command. ```bash -python prepare_doc_milvus.py +python prepare_doc_lancedb.py ``` ## 🚀2. Start Microservice with Docker (Option 2) -### 2.1 Start Milvus Server +### 2.1 Start Lancedb Server -Please refer to this [readme](../../../vectorstores/milvus/README.md). +Please refer to this [readme](../../../vectorstores/lancedb/README.md). ### 2.2 Build Docker Image @@ -70,40 +67,39 @@ Please refer to this [readme](../../../vectorstores/milvus/README.md). cd ../../..
# build mosec embedding docker image docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . -# build dataprep milvus docker image -docker build -t opea/dataprep-minio-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/milvus/langchain/Dockerfile . +# build dataprep lancedb docker image +docker build -t opea/dataprep-minio-lancedb:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/lancedb/langchain/Dockerfile . ``` ### 2.3 Setup Environment Variables ```bash export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" -export MILVUS_HOST=${your_host_ip} +export MINIO_ACCESS_KEY=${your_minio_access_key} +export MINIO_SECRET_KEY=${your_minio_secret_key} +export MINIO_ENDPOINT=${your_minio_endpoint} +export MINIO_SECURE=${your_minio_secure} ``` ### 2.3 Run Docker with CLI (Option A) ```bash -docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} opea/dataprep-milvus:latest -``` - -### 2.4 Run with Docker Compose (Option B) - -```bash -mkdir model -cd model -git clone https://huggingface.co/BAAI/bge-base-en-v1.5 -cd ../ -# Update `host_ip` and `HUGGINGFACEHUB_API_TOKEN` in set_env.sh -.
set_env.sh -docker compose -f docker-compose-dataprep-milvus.yaml up -d +docker run -d --name="dataprep-lancedb-server" -p 6010:6010 --ipc=host \ +-e http_proxy=$http_proxy -e https_proxy=$https_proxy \ +-e no_proxy=$no_proxy \ +-e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} \ +-e MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} \ +-e MINIO_SECRET_KEY=${MINIO_SECRET_KEY} \ +-e MINIO_ENDPOINT=${MINIO_ENDPOINT} \ +-e MINIO_SECURE=${MINIO_SECURE} \ +opea/dataprep-minio-lancedb:latest ``` ## 🚀3. Consume Microservice ### 3.1 Consume Upload API -Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. +Once the document preparation microservice for Lancedb is started, use the command below to invoke the microservice, which converts the document to embeddings and saves them to the database. Make sure the file path after `files=@` is correct. From 78064a8aca9d8e89b8f80510d10d1e6082553f53 Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 06:01:06 +0000 Subject: [PATCH 07/28] Add MinIO LanceDB retriever support --- .../minio/lancedb/langchain/Dockerfile | 31 +++++ .../minio/lancedb/langchain/README.md | 108 +++++++++++++++ .../minio/lancedb/langchain/__init__.py | 2 + .../minio/lancedb/langchain/config.py | 25 ++++ .../minio/lancedb/langchain/requirements.txt | 26 ++++ .../lancedb/langchain/retriever_lancedb.py | 126 ++++++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 comps/retrievers/minio/lancedb/langchain/Dockerfile create mode 100644 comps/retrievers/minio/lancedb/langchain/README.md create mode 100644 comps/retrievers/minio/lancedb/langchain/__init__.py create mode 100644 comps/retrievers/minio/lancedb/langchain/config.py create mode 100644 comps/retrievers/minio/lancedb/langchain/requirements.txt create mode 100644 comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py diff --git a/comps/retrievers/minio/lancedb/langchain/Dockerfile
b/comps/retrievers/minio/lancedb/langchain/Dockerfile new file mode 100644 index 0000000000..c3232591f3 --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/Dockerfile @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev && rm -rf /var/lib/apt/lists/* + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/minio/lancedb/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/minio/lancedb/langchain + +ENTRYPOINT ["python", "retriever_lancedb.py"] diff --git a/comps/retrievers/minio/lancedb/langchain/README.md b/comps/retrievers/minio/lancedb/langchain/README.md new file mode 100644 index 0000000000..d857c3de42 --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/README.md @@ -0,0 +1,108 @@ +# Retriever Microservice with MinIO and Lancedb + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export MINIO_ACCESS_KEY=${your_minio_access_key} +export MINIO_SECRET_KEY=${your_minio_secret_key} +export MINIO_ENDPOINT=${your_minio_endpoint} +export MINIO_SECURE=${your_minio_secure} +export COLLECTION_NAME=${your_collection_name} +export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} +``` + +### Start Retriever Service + +```bash +export
MOSEC_EMBEDDING_ENDPOINT="http://${your_ip}:6060" +python retriever_lancedb.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../../ +docker build -t opea/retriever-minio-lancedb:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/minio/lancedb/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="retriever-minio-lancedb-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} \ + -e MINIO_ENDPOINT=${your_minio_endpoint} \ + -e MINIO_ACCESS_KEY=${your_minio_access_key} \ + -e MINIO_SECRET_KEY=${your_minio_secret_key} \ + -e MINIO_SECURE=${your_minio_secure} \ + opea/retriever-minio-lancedb:latest +``` + +## 🚀3. Consume Retriever Service + +### 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 3.2 Consume Retriever Service + +To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ + -H 'Content-Type: application/json' +``` + +You can set the parameters for the retriever.
+ +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/minio/lancedb/langchain/__init__.py b/comps/retrievers/minio/lancedb/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/minio/lancedb/langchain/config.py 
b/comps/retrievers/minio/lancedb/langchain/config.py new file mode 100644 index 0000000000..7d5551799c --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/config.py @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Local Embedding model +LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1") +# TEI Embedding endpoints +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus") +# MOSEC configuration +MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bce-embedding-base_v1") +MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "") +MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") +MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == 'true' +MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") +MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") +os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT +os.environ["OPENAI_API_KEY"] = "Dummy key" +os.environ["AWS_ENDPOINT"] = f"http://{MINIO_ENDPOINT}" +os.environ["AWS_ACCESS_KEY_ID"] = MINIO_ACCESS_KEY +os.environ["AWS_SECRET_ACCESS_KEY"] = MINIO_SECRET_KEY +os.environ["ALLOW_HTTP"] = str(MINIO_SECURE != "true").lower() diff --git a/comps/retrievers/minio/lancedb/langchain/requirements.txt b/comps/retrievers/minio/lancedb/langchain/requirements.txt new file mode 100644 index 0000000000..3a090154ed --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/requirements.txt @@ -0,0 +1,26 @@ +beautifulsoup4 +docarray[full] +easyocr +fastapi +frontend==0.0.3 +huggingface_hub +lancedb +langchain +langchain-community +minio +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp 
+opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pydantic==2.7.3 +pymilvus==2.4.3 +pymupdf==1.24.5 +python-docx==0.8.11 +sentence_transformers +shortuuid +tiktoken +uvicorn diff --git a/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py new file mode 100644 index 0000000000..ce33e5e980 --- /dev/null +++ b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py @@ -0,0 +1,126 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import List, Optional + +from config import ( + COLLECTION_NAME, + LOCAL_EMBEDDING_MODEL, + MOSEC_EMBEDDING_ENDPOINT, + MOSEC_EMBEDDING_MODEL, + TEI_EMBEDDING_ENDPOINT, + MINIO_WAREHOUSE_BUCKET +) +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings +from langchain_community.vectorstores import LanceDB + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("retriever_lancedb") +logflag = os.getenv("LOGFLAG", False) + + +class MosecEmbeddings(OpenAIEmbeddings): + def _get_len_safe_embeddings( + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + ) -> List[List[float]]: + _chunk_size = chunk_size or self.chunk_size + batched_embeddings: List[List[float]] = [] + response = self.client.create(input=texts, **self._invocation_params) + if not isinstance(response, dict): + response = response.model_dump() + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + _cached_empty_embedding: Optional[List[float]] = None + + def empty_embedding() -> List[float]: + nonlocal _cached_empty_embedding + if _cached_empty_embedding is None: + average_embedded = self.client.create(input="", **self._invocation_params) + if not 
isinstance(average_embedded, dict): + average_embedded = average_embedded.model_dump() + _cached_empty_embedding = average_embedded["data"][0]["embedding"] + return _cached_empty_embedding + + return [e if e is not None else empty_embedding() for e in batched_embeddings] + + +@register_microservice( + name="opea_service@retriever_lancedb", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_lancedb"]) +async def retrieve(input: EmbedDoc) -> SearchedDoc: + if logflag: + logger.info(input) + + start = time.time() + if input.search_type == "similarity": + search_res = await vector_db.asimilarity_search_by_vector(embedding=input.embedding, k=input.k) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = await vector_db.asimilarity_search_by_vector( + embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold + ) + elif input.search_type == "similarity_score_threshold": + docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + searched_docs = [] + for r in search_res: + searched_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + statistics_dict["opea_service@retriever_lancedb"].append_latency(time.time() - start, None) + if logflag: + logger.info(result) + return result + + +if __name__ == "__main__": + # Create vectorstore + if MOSEC_EMBEDDING_ENDPOINT: + # create embeddings 
using Mosec endpoint service + if logflag: + logger.info(f"[ retriever_lancedb ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}") + embeddings = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) + elif TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + if logflag: + logger.info(f"[ retriever_lancedb ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) + else: + # create embeddings using local embedding model + if logflag: + logger.info(f"[ retriever_lancedb ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") + embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={ + 'device': 'cpu', + 'trust_remote_code': True + }) + + vector_db = LanceDB( + uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", + embedding=embeddings, + table_name=COLLECTION_NAME) + + opea_microservices["opea_service@retriever_lancedb"].start() From eb8897dd966112eb4796a8ead82bc3f6bb97cad6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 3 Nov 2024 06:08:36 +0000 Subject: [PATCH 08/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/minio/__init__.py | 2 + .../minio/lancedb/langchain/config.py | 3 +- .../lancedb/langchain/prepare_doc_lancedb.py | 99 +++++++++---------- .../minio/lancedb/langchain/requirements.txt | 4 +- .../dataprep/minio/milvus/langchain/config.py | 5 +- .../milvus/langchain/prepare_doc_milvus.py | 80 ++++++++------- comps/dataprep/minio/minio_schema.py | 24 ++++- comps/dataprep/minio_utils.py | 2 + .../minio/lancedb/langchain/config.py | 2 +- .../lancedb/langchain/retriever_lancedb.py | 18 ++-- 10 files changed, 121 insertions(+), 118 deletions(-) diff --git a/comps/dataprep/minio/__init__.py b/comps/dataprep/minio/__init__.py index e69de29bb2..916f3a44b2 100644 --- a/comps/dataprep/minio/__init__.py +++ 
b/comps/dataprep/minio/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/lancedb/langchain/config.py b/comps/dataprep/minio/lancedb/langchain/config.py index 85f53a2661..b0940475f6 100644 --- a/comps/dataprep/minio/lancedb/langchain/config.py +++ b/comps/dataprep/minio/lancedb/langchain/config.py @@ -15,7 +15,7 @@ MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") -MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == 'true' +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == "true" MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT @@ -24,4 +24,3 @@ os.environ["AWS_ACCESS_KEY_ID"] = MINIO_ACCESS_KEY os.environ["AWS_SECRET_ACCESS_KEY"] = MINIO_SECRET_KEY os.environ["ALLOW_HTTP"] = str(MINIO_SECURE != "true").lower() - diff --git a/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py index 1bef45b4bd..45e4deae55 100644 --- a/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py +++ b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py @@ -2,37 +2,35 @@ # SPDX-License-Identifier: Apache-2.0 import io import json - -import lancedb -import msgpack import os import tempfile from pathlib import Path from typing import List, Optional, Union -from minio import Minio, S3Error - -from comps.dataprep.minio.minio_schema import MinioEventNotification +import lancedb +import msgpack from config import ( COLLECTION_NAME, LOCAL_EMBEDDING_MODEL, - MINIO_ENDPOINT, MINIO_ACCESS_KEY, + MINIO_DOCUMENT_BUCKET, + MINIO_ENDPOINT, MINIO_SECRET_KEY, MINIO_SECURE, - 
MINIO_DOCUMENT_BUCKET, + MINIO_WAREHOUSE_BUCKET, MOSEC_EMBEDDING_ENDPOINT, MOSEC_EMBEDDING_MODEL, TEI_EMBEDDING_ENDPOINT, - MINIO_WAREHOUSE_BUCKET ) from fastapi import Body, File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings from langchain_community.vectorstores import LanceDB from langchain_text_splitters import HTMLHeaderTextSplitter +from minio import Minio, S3Error from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.minio.minio_schema import MinioEventNotification from comps.dataprep.utils import ( decode_filename, document_loader, @@ -51,15 +49,13 @@ partition_field_name = "filename" minio_client = Minio( - endpoint=MINIO_ENDPOINT, - access_key=MINIO_ACCESS_KEY, - secret_key=MINIO_SECRET_KEY, - secure=MINIO_SECURE) + endpoint=MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=MINIO_SECURE +) class MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] @@ -103,19 +99,16 @@ def ingest_chunks_to_lancedb(file_name: str, chunks: List): for i in range(0, num_chunks, batch_size): if logflag: logger.info(f"[ ingest chunks ] Current batch: {i}") - batch_texts = insert_text[i: i + batch_size] - batch_metadata = insert_metadata[i: i + batch_size] + batch_texts = insert_text[i : i + batch_size] + batch_metadata = insert_metadata[i : i + batch_size] batch_embeddings = embeddings.embed_documents(batch_texts) - batch_doc_ids = doc_ids[i: i + batch_size] + batch_doc_ids = doc_ids[i : i + batch_size] data_docs = [] for j, doc in enumerate(batch_texts): - 
data_docs.append({ - "text": doc, - "metadata": batch_metadata[j], - "vector": batch_embeddings[j], - "id": batch_doc_ids[j] - }) + data_docs.append( + {"text": doc, "metadata": batch_metadata[j], "vector": batch_embeddings[j], "id": batch_doc_ids[j]} + ) try: @@ -224,15 +217,16 @@ def delete_by_partition_field(my_lancedb, partition_field): logger.info(f"[ delete partition ] delete success: {res}") -@register_microservice(name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/dataprep", host="0.0.0.0", - port=6010) +@register_microservice( + name="opea_service@prepare_doc_minio_lancedb", endpoint="/v1/dataprep", host="0.0.0.0", port=6010 +) async def ingest_documents( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1000), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), ): if logflag: logger.info(f"[ upload ] files:{files}") @@ -266,8 +260,9 @@ async def ingest_documents( "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "process_table": process_table, - "table_strategy": table_strategy - }) + "table_strategy": table_strategy, + }, + ) uploaded_files.append(save_path) if logflag: @@ -303,8 +298,9 @@ async def ingest_documents( "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "process_table": process_table, - "table_strategy": table_strategy - }) + "table_strategy": table_strategy, + }, + ) if logflag: logger.info(f"[ upload ] Successfully saved link list {link_list}") @@ -328,13 +324,15 @@ async def process_documents(event: MinioEventNotification): with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_file: temp_file_path = 
temp_file.name minio_client.fget_object(bucket_name, object_name, temp_file_path) - chunks = ingest_data_to_minio(DocPath( - path=temp_file_path, - chunk_size=record.s3.object.userMetadata.chunk_size, - chunk_overlap=record.s3.object.userMetadata.chunk_overlap, - process_table=record.s3.object.userMetadata.process_table, - table_strategy=record.s3.object.userMetadata.table_strategy, - )) + chunks = ingest_data_to_minio( + DocPath( + path=temp_file_path, + chunk_size=record.s3.object.userMetadata.chunk_size, + chunk_overlap=record.s3.object.userMetadata.chunk_overlap, + process_table=record.s3.object.userMetadata.process_table, + table_strategy=record.s3.object.userMetadata.table_strategy, + ) + ) msgpack_data = msgpack.packb(chunks) buffer = io.BytesIO(msgpack_data) buffer_size = buffer.getbuffer().nbytes @@ -343,13 +341,12 @@ async def process_documents(event: MinioEventNotification): object_name=f"metadata/{object_name}.msgpack", data=buffer, length=buffer_size, - content_type='application/x-msgpack' + content_type="application/x-msgpack", ) if event.EventName == "s3:ObjectRemoved:Delete": for record in event.Records: object_name = record.s3.object.key - minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, - object_name=f"metadata/{object_name}.msgpack") + minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, object_name=f"metadata/{object_name}.msgpack") return {"status": 200, "message": "Document processed successfully"} @@ -486,15 +483,11 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): # create embeddings using local embedding model if logflag: logger.info(f"[ prepare_doc_minio_lancedb ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") - embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={ - 'device': 'cpu', - 'trust_remote_code': True - }) + embeddings = HuggingFaceBgeEmbeddings( + model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={"device": "cpu", "trust_remote_code": True} + ) # create lancedb - my_lancedb = 
LanceDB( - uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", - embedding=embeddings, - table_name=COLLECTION_NAME) + my_lancedb = LanceDB(uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", embedding=embeddings, table_name=COLLECTION_NAME) opea_microservices["opea_service@prepare_doc_minio_lancedb"].start() print("DOCPREP Server Started") diff --git a/comps/dataprep/minio/lancedb/langchain/requirements.txt b/comps/dataprep/minio/lancedb/langchain/requirements.txt index 6c5163dfb7..1adb3357b8 100644 --- a/comps/dataprep/minio/lancedb/langchain/requirements.txt +++ b/comps/dataprep/minio/lancedb/langchain/requirements.txt @@ -10,8 +10,8 @@ langchain langchain-community langchain-text-splitters markdown -msgpack minio +msgpack numpy openai opentelemetry-api @@ -25,8 +25,8 @@ pyspark pytesseract python-docx python-pptx -sentence_transformers s3fs +sentence_transformers shortuuid tiktoken unstructured[all-docs]==0.15.7 diff --git a/comps/dataprep/minio/milvus/langchain/config.py b/comps/dataprep/minio/milvus/langchain/config.py index 205b2ecd5a..055dc0194c 100644 --- a/comps/dataprep/minio/milvus/langchain/config.py +++ b/comps/dataprep/minio/milvus/langchain/config.py @@ -16,10 +16,9 @@ MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "") MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") -MINIO_SECRET_KEY=os.environ.get("MINIO_SECRET_KEY", "minioadmin") -MINIO_SECURE=os.environ.get("MINIO_SECURE", "False").lower() == 'true' +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == "true" MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT os.environ["OPENAI_API_KEY"] = "Dummy key" - diff --git 
a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py index 76ceb5a2af..be8cec8ee2 100644 --- a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py @@ -2,38 +2,37 @@ # SPDX-License-Identifier: Apache-2.0 import io import json -import msgpack import os import tempfile from pathlib import Path from typing import List, Optional, Union -from minio import Minio, S3Error - -from comps.dataprep.minio.milvus.langchain.config import MINIO_WAREHOUSE_BUCKET -from comps.dataprep.minio.minio_schema import MinioEventNotification +import msgpack from config import ( COLLECTION_NAME, LOCAL_EMBEDDING_MODEL, - MINIO_ENDPOINT, - MINIO_ACCESS_KEY, - MINIO_SECRET_KEY, - MINIO_SECURE, MILVUS_HOST, MILVUS_PORT, + MINIO_ACCESS_KEY, MINIO_DOCUMENT_BUCKET, + MINIO_ENDPOINT, + MINIO_SECRET_KEY, + MINIO_SECURE, MOSEC_EMBEDDING_ENDPOINT, MOSEC_EMBEDDING_MODEL, TEI_EMBEDDING_ENDPOINT, ) -from fastapi import Body, File, Form, HTTPException, UploadFile, Request +from fastapi import Body, File, Form, HTTPException, Request, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings from langchain_core.documents import Document from langchain_milvus.vectorstores import Milvus from langchain_text_splitters import HTMLHeaderTextSplitter +from minio import Minio, S3Error from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.minio.milvus.langchain.config import MINIO_WAREHOUSE_BUCKET +from comps.dataprep.minio.minio_schema import MinioEventNotification from comps.dataprep.utils import ( create_upload_folder, decode_filename, @@ -55,15 +54,13 @@ upload_folder = "./uploaded_files/" minio_client = Minio( - endpoint=MINIO_ENDPOINT, - access_key=MINIO_ACCESS_KEY, - 
secret_key=MINIO_SECRET_KEY, - secure=MINIO_SECURE) + endpoint=MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=MINIO_SECURE +) class MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] @@ -102,7 +99,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): for i in range(0, num_chunks, batch_size): if logflag: logger.info(f"[ ingest chunks ] Current batch: {i}") - batch_docs = insert_docs[i: i + batch_size] + batch_docs = insert_docs[i : i + batch_size] try: logger.info(f"MILVUS HOST IS: {MILVUS_HOST}") @@ -224,12 +221,12 @@ def delete_by_partition_field(my_milvus, partition_field): @register_microservice(name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) async def ingest_documents( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1000), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), ): if logflag: logger.info(f"[ upload ] files:{files}") @@ -263,8 +260,9 @@ async def ingest_documents( "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "process_table": process_table, - "table_strategy": table_strategy - }) + "table_strategy": table_strategy, + }, + ) uploaded_files.append(save_path) if logflag: @@ -300,8 +298,9 @@ async def ingest_documents( "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, 
"process_table": process_table, - "table_strategy": table_strategy - }) + "table_strategy": table_strategy, + }, + ) if logflag: logger.info(f"[ upload ] Successfully saved link list {link_list}") @@ -325,13 +324,15 @@ async def process_documents(event: MinioEventNotification): with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_file: temp_file_path = temp_file.name minio_client.fget_object(bucket_name, object_name, temp_file_path) - chunks = ingest_data_to_minio(DocPath( - path=temp_file_path, - chunk_size=record.s3.object.userMetadata.chunk_size, - chunk_overlap=record.s3.object.userMetadata.chunk_overlap, - process_table=record.s3.object.userMetadata.process_table, - table_strategy=record.s3.object.userMetadata.table_strategy, - )) + chunks = ingest_data_to_minio( + DocPath( + path=temp_file_path, + chunk_size=record.s3.object.userMetadata.chunk_size, + chunk_overlap=record.s3.object.userMetadata.chunk_overlap, + process_table=record.s3.object.userMetadata.process_table, + table_strategy=record.s3.object.userMetadata.table_strategy, + ) + ) msgpack_data = msgpack.packb(chunks) buffer = io.BytesIO(msgpack_data) buffer_size = buffer.getbuffer().nbytes @@ -340,13 +341,12 @@ async def process_documents(event: MinioEventNotification): object_name=f"metadata/{object_name}.msgpack", data=buffer, length=buffer_size, - content_type='application/x-msgpack' + content_type="application/x-msgpack", ) if event.EventName == "s3:ObjectRemoved:Delete": for record in event.Records: object_name = record.s3.object.key - minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, - object_name=f"metadata/{object_name}.msgpack") + minio_client.remove_object(MINIO_WAREHOUSE_BUCKET, object_name=f"metadata/{object_name}.msgpack") return {"status": 200, "message": "Document processed successfully"} @@ -463,7 +463,6 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): if logflag: logger.info("[ delete ] deleting all files") - delete_all_data() if 
logflag: @@ -507,10 +506,9 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): # create embeddings using local embedding model if logflag: logger.info(f"[ prepare_doc_minio_milvus ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") - embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs = { - 'device': 'cpu', - 'trust_remote_code':True - }) + embeddings = HuggingFaceBgeEmbeddings( + model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={"device": "cpu", "trust_remote_code": True} + ) opea_microservices["opea_service@prepare_doc_minio_milvus"].start() print("DOCPREP Server Started") diff --git a/comps/dataprep/minio/minio_schema.py b/comps/dataprep/minio/minio_schema.py index 4fdee03ba1..d8172794c3 100644 --- a/comps/dataprep/minio/minio_schema.py +++ b/comps/dataprep/minio/minio_schema.py @@ -1,31 +1,40 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from datetime import datetime +from typing import List, Optional from urllib.parse import unquote from pydantic import BaseModel, Field, validator -from typing import List, Optional -from datetime import datetime + class UserIdentity(BaseModel): principalId: str + class RequestParameters(BaseModel): principalId: str region: str sourceIPAddress: str + class ResponseElements(BaseModel): x_amz_id_2: str = Field(..., alias="x-amz-id-2") x_amz_request_id: str = Field(..., alias="x-amz-request-id") x_minio_deployment_id: str = Field(..., alias="x-minio-deployment-id") x_minio_origin_endpoint: str = Field(..., alias="x-minio-origin-endpoint") + class BucketOwnerIdentity(BaseModel): principalId: str + class Bucket(BaseModel): name: str ownerIdentity: BucketOwnerIdentity arn: str + class ObjectUserMetadata(BaseModel): content_type: str = Field(..., alias="content-type") chunk_overlap: Optional[int] = Field(100, alias="X-Amz-Meta-Chunk_overlap") @@ -37,6 +46,7 @@ class Config: populate_by_name = True allow_population_by_field_name = True + class 
S3Object(BaseModel): key: str size: Optional[int] = None @@ -45,22 +55,25 @@ class S3Object(BaseModel): userMetadata: Optional[ObjectUserMetadata] = None sequencer: str - @validator('key') + @validator("key") def decode_key(cls, v): - """Decode URL-encoded key""" + """Decode URL-encoded key.""" return unquote(v) + class S3(BaseModel): s3SchemaVersion: str configurationId: str bucket: Bucket object: S3Object + class Source(BaseModel): host: str port: str userAgent: str + class Record(BaseModel): eventVersion: str eventSource: str @@ -73,6 +86,7 @@ class Record(BaseModel): s3: S3 source: Source + class MinioEventNotification(BaseModel): EventName: str Key: str @@ -80,4 +94,4 @@ class MinioEventNotification(BaseModel): class Config: from_attributes = True - populate_by_name = True \ No newline at end of file + populate_by_name = True diff --git a/comps/dataprep/minio_utils.py b/comps/dataprep/minio_utils.py index e69de29bb2..916f3a44b2 100644 --- a/comps/dataprep/minio_utils.py +++ b/comps/dataprep/minio_utils.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/minio/lancedb/langchain/config.py b/comps/retrievers/minio/lancedb/langchain/config.py index 7d5551799c..a0b41163ed 100644 --- a/comps/retrievers/minio/lancedb/langchain/config.py +++ b/comps/retrievers/minio/lancedb/langchain/config.py @@ -14,7 +14,7 @@ MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000") MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") -MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == 'true' +MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == "true" MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document") MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse") os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT diff --git 
a/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py index ce33e5e980..71d833aeb8 100644 --- a/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py +++ b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py @@ -8,10 +8,10 @@ from config import ( COLLECTION_NAME, LOCAL_EMBEDDING_MODEL, + MINIO_WAREHOUSE_BUCKET, MOSEC_EMBEDDING_ENDPOINT, MOSEC_EMBEDDING_MODEL, TEI_EMBEDDING_ENDPOINT, - MINIO_WAREHOUSE_BUCKET ) from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings from langchain_community.vectorstores import LanceDB @@ -34,7 +34,7 @@ class MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] @@ -113,14 +113,10 @@ async def retrieve(input: EmbedDoc) -> SearchedDoc: # create embeddings using local embedding model if logflag: logger.info(f"[ retriever_lancedb ] LOCAL_EMBEDDING_MODEL:{LOCAL_EMBEDDING_MODEL}") - embeddings = HuggingFaceBgeEmbeddings(model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={ - 'device': 'cpu', - 'trust_remote_code': True - }) - - vector_db = LanceDB( - uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", - embedding=embeddings, - table_name=COLLECTION_NAME) + embeddings = HuggingFaceBgeEmbeddings( + model_name=LOCAL_EMBEDDING_MODEL, model_kwargs={"device": "cpu", "trust_remote_code": True} + ) + + vector_db = LanceDB(uri=f"s3://{MINIO_WAREHOUSE_BUCKET}/v-db", embedding=embeddings, table_name=COLLECTION_NAME) opea_microservices["opea_service@retriever_lancedb"].start() From 616dc63a17f06dc7c061d55bfc6b17a723616363 Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 06:18:50 +0000 Subject: [PATCH 09/28] Fix README.md paths --- 
comps/dataprep/minio/lancedb/langchain/README.md | 10 +++------- comps/dataprep/minio/milvus/langchain/README.md | 6 +++--- comps/retrievers/minio/lancedb/langchain/README.md | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index 7e8ce66c99..a89697b771 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -30,7 +30,7 @@ export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} First, you need to build a mosec embedding serving docker image. ```bash -cd ../../.. +cd ../../../.. docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . ``` @@ -57,14 +57,10 @@ python prepare_doc_lancedb.py ## 🚀2. Start Microservice with Docker (Option 2) -### 2.1 Start Lancedb Server - -Please refer to this [readme](../../../vectorstores/lancedb/README.md). - -### 2.2 Build Docker Image +### 2.1 Build Docker Image ```bash -cd ../../.. +cd ../../../.. # build mosec embedding docker image docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . # build dataprep lancedb docker image diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index f349df54cb..bee8c4a11c 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -13,7 +13,7 @@ apt-get install poppler-utils -y ### 1.2 Start Milvus Server -Please refer to this [readme](../../../vectorstores/milvus/README.md). +Please refer to this [readme](../../../../vectorstores/milvus/README.md). 
### 1.3 Setup Environment Variables @@ -62,12 +62,12 @@ python prepare_doc_milvus.py ### 2.1 Start Milvus Server -Please refer to this [readme](../../../vectorstores/milvus/README.md). +Please refer to this [readme](../../../../vectorstores/milvus/README.md). ### 2.2 Build Docker Image ```bash -cd ../../.. +cd ../../../.. # build mosec embedding docker image docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . # build dataprep milvus docker image diff --git a/comps/retrievers/minio/lancedb/langchain/README.md b/comps/retrievers/minio/lancedb/langchain/README.md index d857c3de42..80d09c8857 100644 --- a/comps/retrievers/minio/lancedb/langchain/README.md +++ b/comps/retrievers/minio/lancedb/langchain/README.md @@ -34,7 +34,7 @@ python retriever_redis.py ### Build Docker Image ```bash -cd ../../ +cd ../../../../ docker build -t opea/retriever-minio-lancedb:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/minio/lancedb/langchain/Dockerfile . ``` From c05bdcb9ed2569136614ac3634f6e6f0d339a429 Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 06:37:01 +0000 Subject: [PATCH 10/28] Fix README.md for Milvus to right path --- comps/dataprep/milvus/langchain/README.md | 36 ++++++++---------- .../dataprep/minio/milvus/langchain/README.md | 38 +++++++++---------- 2 files changed, 34 insertions(+), 40 deletions(-) diff --git a/comps/dataprep/milvus/langchain/README.md b/comps/dataprep/milvus/langchain/README.md index 1c0ee9f606..f349df54cb 100644 --- a/comps/dataprep/milvus/langchain/README.md +++ b/comps/dataprep/milvus/langchain/README.md @@ -1,4 +1,4 @@ -# Dataprep Microservice with MinIO and Milvus +# Dataprep Microservice with Milvus ## 🚀1. 
Start Microservice with Python (Option 1) @@ -23,10 +23,6 @@ export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} export MILVUS_HOST=${your_milvus_host_ip} export MILVUS_PORT=19530 -export MINIO_ACCESS_KEY=${your_minio_access_key} -export MINIO_SECRET_KEY=${your_minio_secret_key} -export MINIO_ENDPOINT=${your_minio_endpoint} -export MINIO_SECURE = ${your_minio_secure} export COLLECTION_NAME=${your_collection_name} export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} ``` @@ -75,7 +71,7 @@ cd ../../.. # build mosec embedding docker image docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . # build dataprep milvus docker image -docker build -t opea/dataprep-minio-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/milvus/langchain/Dockerfile . +docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/milvus/langchain/Dockerfile . 
``` ### 2.3 Setup Environment Variables @@ -83,26 +79,24 @@ docker build -t opea/dataprep-minio-milvus:latest --build-arg https_proxy=$https ```bash export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" export MILVUS_HOST=${your_host_ip} -export MINIO_ACCESS_KEY=${your_minio_access_key} -export MINIO_SECRET_KEY=${your_minio_secret_key} -export MINIO_ENDPOINT=${your_minio_endpoint} -export MINIO_SECURE = ${your_minio_secure} ``` ### 2.3 Run Docker with CLI (Option A) ```bash -docker run -d --name="dataprep-minio-milvus-server" -p 6010:6010 --ipc=host \ --e http_proxy=$http_proxy \ --e https_proxy=$https_proxy \ --e no_proxy=$no_proxy \ --e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} \ --e MILVUS_HOST=${MILVUS_HOST} \ --e MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} \ --e MINIO_SECRET_KEY=${MINIO_SECRET_KEY} \ --e MINIO_ENDPOINT=${MINIO_ENDPOINT} \ --e MINIO_SECURE=${MINIO_SECURE} \ -opea/dataprep-minio-milvus:latest +docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} opea/dataprep-milvus:latest +``` + +### 2.4 Run with Docker Compose (Option B) + +```bash +mkdir model +cd model +git clone https://huggingface.co/BAAI/bge-base-en-v1.5 +cd ../ +# Update `host_ip` and `HUGGINGFACEHUB_API_TOKEN` in set_env.sh +. set_env.sh +docker compose -f docker-compose-dataprep-milvus.yaml up -d ``` ## 🚀3. Consume Microservice diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index bee8c4a11c..80e099e31b 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -1,4 +1,4 @@ -# Dataprep Microservice with Milvus +# Dataprep Microservice with MinIO and Milvus ## 🚀1. 
Start Microservice with Python (Option 1) @@ -155,17 +155,17 @@ import json proxies = {"http": ""} url = "http://localhost:6010/v1/dataprep" urls = [ - "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" + "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" ] payload = {"link_list": json.dumps(urls)} try: - resp = requests.post(url=url, data=payload, proxies=proxies) - print(resp.text) - resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes - print("Request successful!") + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") except requests.exceptions.RequestException as e: - print("An error occurred:", e) + print("An error occurred:", e) ``` We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". 
@@ -198,18 +198,18 @@ Then you will get the response JSON like this: ```json [ - { - "name": "uploaded_file_1.txt", - "id": "uploaded_file_1.txt", - "type": "File", - "parent": "" - }, - { - "name": "uploaded_file_2.txt", - "id": "uploaded_file_2.txt", - "type": "File", - "parent": "" - } + { + "name": "uploaded_file_1.txt", + "id": "uploaded_file_1.txt", + "type": "File", + "parent": "" + }, + { + "name": "uploaded_file_2.txt", + "id": "uploaded_file_2.txt", + "type": "File", + "parent": "" + } ] ``` From a65d3dca088eb0f4433b9beed5467451117a7647 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 3 Nov 2024 06:37:29 +0000 Subject: [PATCH 11/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../dataprep/minio/milvus/langchain/README.md | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index 80e099e31b..cdc1b2ddbb 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -155,17 +155,17 @@ import json proxies = {"http": ""} url = "http://localhost:6010/v1/dataprep" urls = [ - "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" + "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4" ] payload = {"link_list": json.dumps(urls)} try: - resp = requests.post(url=url, data=payload, proxies=proxies) - print(resp.text) - resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes - print("Request successful!") + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an 
exception for unsuccessful HTTP status codes + print("Request successful!") except requests.exceptions.RequestException as e: - print("An error occurred:", e) + print("An error occurred:", e) ``` We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". @@ -198,18 +198,18 @@ Then you will get the response JSON like this: ```json [ - { - "name": "uploaded_file_1.txt", - "id": "uploaded_file_1.txt", - "type": "File", - "parent": "" - }, - { - "name": "uploaded_file_2.txt", - "id": "uploaded_file_2.txt", - "type": "File", - "parent": "" - } + { + "name": "uploaded_file_1.txt", + "id": "uploaded_file_1.txt", + "type": "File", + "parent": "" + }, + { + "name": "uploaded_file_2.txt", + "id": "uploaded_file_2.txt", + "type": "File", + "parent": "" + } ] ``` From 80fe8f627b942ca5fe96ae3bfdc0a99c6f195e20 Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 19:21:13 +0000 Subject: [PATCH 12/28] Fix connection arguments for Milvus --- comps/dataprep/milvus/langchain/prepare_doc_milvus.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py index 3def86f81e..289e183457 100644 --- a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py @@ -93,7 +93,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): batch_docs, embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, partition_key_field=partition_field_name, ) except Exception as e: @@ -211,7 
+211,7 @@ async def ingest_documents( my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, index_params=index_params, auto_id=True, ) @@ -347,7 +347,7 @@ async def rag_get_file_structure(): my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, index_params=index_params, auto_id=True, ) @@ -405,7 +405,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, index_params=index_params, auto_id=True, ) From 33d38f4da5d11aa529f120c0ae4c4784a621c51e Mon Sep 17 00:00:00 2001 From: dil Date: Sun, 3 Nov 2024 22:23:59 +0000 Subject: [PATCH 13/28] Remove local file volumes from docker compose --- .../minio/lancedb/langchain/docker-compose.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/comps/dataprep/minio/lancedb/langchain/docker-compose.yml b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml index bba6cd6cbd..a519ba98cb 100644 --- a/comps/dataprep/minio/lancedb/langchain/docker-compose.yml +++ b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml @@ -28,15 +28,15 @@ services: depends_on: minio: condition: service_healthy - dataprep-lancedb-service: + dataprep-minio-lancedb-service: condition: service_started environment: MINIO_URL: ${MINIO_URL:-http://minio:9000} MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} - DOCUMENT_WEBHOOK_URL: ${DOCUMENT_WEBHOOK_URL:-http://dataprep-lancedb-service:6010/v1/minio/document/notification} - METADATA_WEBHOOK_URL: 
${METADATA_WEBHOOK_URL:-http://dataprep-lancedb-service:6010/v1/minio/metadata/notification} + DOCUMENT_WEBHOOK_URL: ${DOCUMENT_WEBHOOK_URL:-http://dataprep-minio-lancedb-service:6010/v1/minio/document/notification} + METADATA_WEBHOOK_URL: ${METADATA_WEBHOOK_URL:-http://dataprep-minio-lancedb-service:6010/v1/minio/metadata/notification} entrypoint: - /bin/sh - -c @@ -65,17 +65,13 @@ services: echo 'MinIO setup completed successfully!'; - dataprep-lancedb-service: - image: opea/dataprep-minio-lancedb:0.1 + dataprep-minio-lancedb-service: + image: opea/dataprep-minio-lancedb:latest container_name: dataprep-lancedb-server depends_on: - minio ports: - "6010:6010" - volumes: - - "./prepare_doc_lancedb.py:/home/user/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py" - - "./config.py:/home/user/comps/dataprep/minio/lancedb/langchain/config.py" - - "../../minio_schema.py:/home/user/comps/dataprep/minio/minio_schema.py" environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} From c4dfea3d62984103e5b0523f2e0b9eecd2f859f0 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:47:53 -0800 Subject: [PATCH 14/28] Update comps/dataprep/minio/lancedb/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/lancedb/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index a89697b771..7c92ebd731 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -34,7 +34,7 @@ cd ../../../.. docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . ``` -Then start the mosec embedding server. +Then start the Mosec embedding server. 
```bash your_port=6010 From 263aac3b3ce5183ea4fb107df121c59ed142e024 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:48:04 -0800 Subject: [PATCH 15/28] Update comps/dataprep/minio/lancedb/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/lancedb/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index 7c92ebd731..8cd42c592d 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -108,7 +108,7 @@ curl -X POST \ http://localhost:6010/v1/dataprep ``` -You can specify chunk_size and chunk_size by the following commands. To avoid big chunks, pass a small chun_size like 500 as below (default 1500). +You can specify `chunk_size` with the following commands. To avoid big chunks, pass a small `chunk_size` like 500 as below (default 1500). ```bash curl -X POST \ From 5238a850d4c4641ce6a79b15037b0735eb070bcc Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:48:16 -0800 Subject: [PATCH 16/28] Update comps/dataprep/minio/lancedb/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/lancedb/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index 8cd42c592d..db3e88660c 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -160,7 +160,7 @@ except requests.exceptions.RequestException as e: print("An error occurred:", e) ``` -We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. 
"table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". +We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` by the following commands. `table_strategy` refers to the strategies to understand tables for table retrieval. As the setting progresses from `fast` to `hq` to `llm`, the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is `fast`. Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. From d6142a824c290b42bf1156ef9b8d07450ea26cbc Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:48:39 -0800 Subject: [PATCH 17/28] Update comps/dataprep/minio/lancedb/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/lancedb/langchain/README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/comps/dataprep/minio/lancedb/langchain/README.md b/comps/dataprep/minio/lancedb/langchain/README.md index db3e88660c..74a27f08c7 100644 --- a/comps/dataprep/minio/lancedb/langchain/README.md +++ b/comps/dataprep/minio/lancedb/langchain/README.md @@ -168,14 +168,6 @@ Note: If you specify "table_strategy=llm", You should first start TGI Service, p curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep ``` -We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. 
"table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". - -Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. - -```bash -curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep -``` - ### 3.2 Consume get_file API To get uploaded file structures, use the following command: From b66221ea64ddda982da88a7c8425486b58ab803e Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:48:56 -0800 Subject: [PATCH 18/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index cdc1b2ddbb..9963178c73 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -29,7 +29,7 @@ export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} ### 1.4 Start Mosec Embedding Service -First, you need to build a mosec embedding serving docker image. +First, you need to build a Mosec embedding serving docker image. ```bash cd ../../.. 
From 491cdcd3de0393ccdf283c89f8e8db5de6d8fb0f Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:49:20 -0800 Subject: [PATCH 19/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index 9963178c73..ceaceebba8 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -36,7 +36,7 @@ cd ../../.. docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile . ``` -Then start the mosec embedding server. +Then start the Mosec embedding server. ```bash your_port=6010 From 94581a7293d57a72f8c6073e0d2b182800805224 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:49:36 -0800 Subject: [PATCH 20/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index ceaceebba8..30a842b529 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -99,9 +99,9 @@ cd ../ docker compose -f docker-compose-dataprep-milvus.yaml up -d ``` -## 🚀3. Consume Microservice +## 🚀3. Use Microservice -### 3.1 Consume Upload API +### 3.1 Use Upload API Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. 
From 535d09a7c48d22d0d496ad36390538e68ea5c815 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:49:53 -0800 Subject: [PATCH 21/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index 30a842b529..4f40790629 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -103,7 +103,7 @@ docker compose -f docker-compose-dataprep-milvus.yaml up -d ### 3.1 Use Upload API -Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. +Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding, and save it to the database. Make sure the file path after `files=@` is correct. From e726c57f3d55e135623f88139583fe25d1b89673 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:50:06 -0800 Subject: [PATCH 22/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index 4f40790629..ac94a530d5 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -116,7 +116,7 @@ curl -X POST \ http://localhost:6010/v1/dataprep ``` -You can specify chunk_size and chunk_size by the following commands. 
To avoid big chunks, pass a small chun_size like 500 as below (default 1500). +You can specify `chunk_size` with the following commands. To avoid big chunks, pass a small `chunk_size` like 500 as below (default 1500). ```bash curl -X POST \ From fefd9ec97eac1a8bfd206ba468144ebc4a1f8c0c Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:50:17 -0800 Subject: [PATCH 23/28] Update comps/dataprep/minio/milvus/langchain/README.md Co-authored-by: Eero Tamminen --- comps/dataprep/minio/milvus/langchain/README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/comps/dataprep/minio/milvus/langchain/README.md b/comps/dataprep/minio/milvus/langchain/README.md index ac94a530d5..cafea5dd5c 100644 --- a/comps/dataprep/minio/milvus/langchain/README.md +++ b/comps/dataprep/minio/milvus/langchain/README.md @@ -176,14 +176,6 @@ Note: If you specify "table_strategy=llm", You should first start TGI Service, p curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep ``` -We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". - -Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
- -```bash -curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep -``` - ### 3.2 Consume get_file API To get uploaded file structures, use the following command: From 1efea4d92bec7b073da7ab41c115db50ce976fa8 Mon Sep 17 00:00:00 2001 From: dil Date: Wed, 6 Nov 2024 21:05:28 +0000 Subject: [PATCH 24/28] Remove unused code and add appropriate copyrights and minor lint fixex --- comps/dataprep/minio/__init__.py | 2 +- .../minio/lancedb/langchain/Dockerfile | 2 +- .../minio/lancedb/langchain/__init__.py | 2 +- .../minio/lancedb/langchain/config.py | 2 +- .../lancedb/langchain/docker-compose.yml | 2 +- .../lancedb/langchain/prepare_doc_lancedb.py | 18 ++++---- .../minio/milvus/langchain/Dockerfile | 2 +- .../minio/milvus/langchain/__init__.py | 2 +- .../dataprep/minio/milvus/langchain/config.py | 2 +- .../minio/milvus/langchain/docker-compose.yml | 2 +- .../minio/milvus/langchain/milvus.yaml | 2 +- .../milvus/langchain/prepare_doc_milvus.py | 43 +++++++++---------- comps/dataprep/minio/minio_schema.py | 14 +++--- comps/dataprep/minio_utils.py | 2 - .../minio/lancedb/langchain/Dockerfile | 2 +- .../minio/lancedb/langchain/__init__.py | 2 +- .../minio/lancedb/langchain/config.py | 2 +- .../lancedb/langchain/retriever_lancedb.py | 2 +- 18 files changed, 50 insertions(+), 55 deletions(-) delete mode 100644 comps/dataprep/minio_utils.py diff --git a/comps/dataprep/minio/__init__.py b/comps/dataprep/minio/__init__.py index 916f3a44b2..1f58e77f82 100644 --- a/comps/dataprep/minio/__init__.py +++ b/comps/dataprep/minio/__init__.py @@ -1,2 +1,2 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/lancedb/langchain/Dockerfile b/comps/dataprep/minio/lancedb/langchain/Dockerfile index 68ab124ceb..a1f7c56b72 100644 --- a/comps/dataprep/minio/lancedb/langchain/Dockerfile +++ b/comps/dataprep/minio/lancedb/langchain/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 FROM python:3.11-slim diff --git a/comps/dataprep/minio/lancedb/langchain/__init__.py b/comps/dataprep/minio/lancedb/langchain/__init__.py index 916f3a44b2..1f58e77f82 100644 --- a/comps/dataprep/minio/lancedb/langchain/__init__.py +++ b/comps/dataprep/minio/lancedb/langchain/__init__.py @@ -1,2 +1,2 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/lancedb/langchain/config.py b/comps/dataprep/minio/lancedb/langchain/config.py index b0940475f6..124c4dded8 100644 --- a/comps/dataprep/minio/lancedb/langchain/config.py +++ b/comps/dataprep/minio/lancedb/langchain/config.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 import os diff --git a/comps/dataprep/minio/lancedb/langchain/docker-compose.yml b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml index a519ba98cb..20aae71982 100644 --- a/comps/dataprep/minio/lancedb/langchain/docker-compose.yml +++ b/comps/dataprep/minio/lancedb/langchain/docker-compose.yml @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 version: '3.5' diff --git a/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py index 45e4deae55..6e062379eb 100644 --- a/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py +++ b/comps/dataprep/minio/lancedb/langchain/prepare_doc_lancedb.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 import io import json @@ -45,8 +45,8 @@ # workaround notes: cp comps/dataprep/utils.py ./lancedb/utils.py # from utils import document_loader, get_tables_result, parse_html -index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}} -partition_field_name = "filename" +INDEX_PARAMS = {"index_type": "FLAT", "metric_type": "IP", "params": {}} +PARTITION_FIELD_NAME = "filename" minio_client = Minio( endpoint=MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=MINIO_SECURE @@ -90,7 +90,7 @@ def ingest_chunks_to_lancedb(file_name: str, chunks: List): doc_ids = [] for i, chunk in enumerate(chunks): insert_text.append(chunk) - insert_metadata.append({partition_field_name: file_name}) + insert_metadata.append({PARTITION_FIELD_NAME: file_name}) doc_ids.append(f"{file_name}_{i}") # Batch size batch_size = 32 @@ -172,10 +172,10 @@ def ingest_data_to_minio(doc_path: DocPath): def search_by_file(collection, file_name): - query = f"{partition_field_name} == '{file_name}'" + query = f"{PARTITION_FIELD_NAME} == '{file_name}'" results = collection.query( expr=query, - output_fields=[partition_field_name, "pk"], + output_fields=[PARTITION_FIELD_NAME, "pk"], ) if logflag: logger.info(f"[ search by file ] searched by {file_name}") @@ -184,7 +184,7 @@ def search_by_file(collection, file_name): def search_all(collection): - results = collection.search(query="pk >= 0", output_fields=[partition_field_name, "pk"]) + results = collection.search(query="pk >= 0", 
output_fields=[PARTITION_FIELD_NAME, "pk"]) if logflag: logger.info(f"[ search all ] {len(results)} results: {results}") return results @@ -211,8 +211,8 @@ def delete_all_data(): def delete_by_partition_field(my_lancedb, partition_field): if logflag: - logger.info(f"[ delete partition ] deleting {partition_field_name} {partition_field}") - res = my_lancedb.delete(filter=f"metadata.{partition_field_name} == '{partition_field}'") + logger.info(f"[ delete partition ] deleting {PARTITION_FIELD_NAME} {partition_field}") + res = my_lancedb.delete(filter=f"metadata.{PARTITION_FIELD_NAME} == '{partition_field}'") if logflag: logger.info(f"[ delete partition ] delete success: {res}") diff --git a/comps/dataprep/minio/milvus/langchain/Dockerfile b/comps/dataprep/minio/milvus/langchain/Dockerfile index 5766bb24de..d25d3bb78f 100644 --- a/comps/dataprep/minio/milvus/langchain/Dockerfile +++ b/comps/dataprep/minio/milvus/langchain/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 FROM python:3.11-slim diff --git a/comps/dataprep/minio/milvus/langchain/__init__.py b/comps/dataprep/minio/milvus/langchain/__init__.py index 916f3a44b2..1f58e77f82 100644 --- a/comps/dataprep/minio/milvus/langchain/__init__.py +++ b/comps/dataprep/minio/milvus/langchain/__init__.py @@ -1,2 +1,2 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/minio/milvus/langchain/config.py b/comps/dataprep/minio/milvus/langchain/config.py index 055dc0194c..286fae168c 100644 --- a/comps/dataprep/minio/milvus/langchain/config.py +++ b/comps/dataprep/minio/milvus/langchain/config.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 import os diff --git a/comps/dataprep/minio/milvus/langchain/docker-compose.yml b/comps/dataprep/minio/milvus/langchain/docker-compose.yml index 4a756a0444..2635f5d98c 100644 --- a/comps/dataprep/minio/milvus/langchain/docker-compose.yml +++ b/comps/dataprep/minio/milvus/langchain/docker-compose.yml @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 version: '3.5' diff --git a/comps/dataprep/minio/milvus/langchain/milvus.yaml b/comps/dataprep/minio/milvus/langchain/milvus.yaml index 52962b8342..7aff1d4058 100644 --- a/comps/dataprep/minio/milvus/langchain/milvus.yaml +++ b/comps/dataprep/minio/milvus/langchain/milvus.yaml @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 # Licensed to the LF AI & Data foundation under one diff --git a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py index be8cec8ee2..6a5f3aa688 100644 --- a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 import io import json @@ -49,9 +49,8 @@ # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py # from utils import document_loader, get_tables_result, parse_html -index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}} -partition_field_name = "filename" -upload_folder = "./uploaded_files/" +INDEX_PARAMS = {"index_type": "FLAT", "metric_type": "IP", "params": {}} +PARTITION_FIELD_NAME = "filename" minio_client = Minio( endpoint=MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=MINIO_SECURE @@ -60,7 +59,7 @@ class MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] @@ -90,7 +89,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): # insert documents to Milvus insert_docs = [] for chunk in chunks: - insert_docs.append(Document(page_content=chunk, metadata={partition_field_name: file_name})) + insert_docs.append(Document(page_content=chunk, metadata={PARTITION_FIELD_NAME: file_name})) # Batch size batch_size = 32 @@ -99,7 +98,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): for i in range(0, num_chunks, batch_size): if logflag: logger.info(f"[ ingest chunks ] Current batch: {i}") - batch_docs = insert_docs[i : i + batch_size] + batch_docs = insert_docs[i: i + batch_size] try: logger.info(f"MILVUS HOST IS: {MILVUS_HOST}") @@ -108,7 +107,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): embeddings, collection_name=COLLECTION_NAME, connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, - partition_key_field=partition_field_name, + partition_key_field=PARTITION_FIELD_NAME, ) except Exception as e: if logflag: @@ -166,10 +165,10 @@ def ingest_data_to_minio(doc_path: 
DocPath): def search_by_file(collection, file_name): - query = f"{partition_field_name} == '{file_name}'" + query = f"{PARTITION_FIELD_NAME} == '{file_name}'" results = collection.query( expr=query, - output_fields=[partition_field_name, "pk"], + output_fields=[PARTITION_FIELD_NAME, "pk"], ) if logflag: logger.info(f"[ search by file ] searched by {file_name}") @@ -178,7 +177,7 @@ def search_by_file(collection, file_name): def search_all(collection): - results = collection.query(expr="pk >= 0", output_fields=[partition_field_name, "pk"]) + results = collection.query(expr="pk >= 0", output_fields=[PARTITION_FIELD_NAME, "pk"]) if logflag: logger.info(f"[ search all ] {len(results)} results: {results}") return results @@ -209,8 +208,8 @@ def delete_all_data(): def delete_by_partition_field(my_milvus, partition_field): if logflag: - logger.info(f"[ delete partition ] deleting {partition_field_name} {partition_field}") - pks = my_milvus.get_pks(f'{partition_field_name} == "{partition_field}"') + logger.info(f"[ delete partition ] deleting {PARTITION_FIELD_NAME} {partition_field}") + pks = my_milvus.get_pks(f'{PARTITION_FIELD_NAME} == "{partition_field}"') if logflag: logger.info(f"[ delete partition ] target pks: {pks}") res = my_milvus.delete(pks) @@ -221,12 +220,12 @@ def delete_by_partition_field(my_milvus, partition_field): @register_microservice(name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) async def ingest_documents( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1000), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), ): 
if logflag: logger.info(f"[ upload ] files:{files}") @@ -376,7 +375,7 @@ async def process_metadata(event: MinioEventNotification): embedding_function=embeddings, collection_name=COLLECTION_NAME, connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, - index_params=index_params, + index_params=INDEX_PARAMS, auto_id=True, ) for record in event.Records: @@ -405,7 +404,7 @@ async def rag_get_file_structure(): embedding_function=embeddings, collection_name=COLLECTION_NAME, connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"}, - index_params=index_params, + index_params=INDEX_PARAMS, auto_id=True, ) @@ -486,8 +485,6 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): if __name__ == "__main__": - create_upload_folder(upload_folder) - print(f"upload folder {upload_folder} created at {Path(upload_folder).absolute()}") # Create vectorstore if MOSEC_EMBEDDING_ENDPOINT: diff --git a/comps/dataprep/minio/minio_schema.py b/comps/dataprep/minio/minio_schema.py index d8172794c3..474453db28 100644 --- a/comps/dataprep/minio/minio_schema.py +++ b/comps/dataprep/minio/minio_schema.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 from datetime import datetime @@ -61,20 +61,20 @@ def decode_key(cls, v): return unquote(v) -class S3(BaseModel): +class MinIOS3(BaseModel): s3SchemaVersion: str configurationId: str bucket: Bucket object: S3Object -class Source(BaseModel): +class MinIOSource(BaseModel): host: str port: str userAgent: str -class Record(BaseModel): +class MinIORecord(BaseModel): eventVersion: str eventSource: str awsRegion: str @@ -83,14 +83,14 @@ class Record(BaseModel): userIdentity: UserIdentity requestParameters: RequestParameters responseElements: ResponseElements - s3: S3 - source: Source + s3: MinIOS3 = Field(..., alias="s3") + source: MinIOSource = Field(..., alias="source") class MinioEventNotification(BaseModel): EventName: str Key: str - Records: List[Record] + Records: List[MinIORecord] = Field(..., alias="Records") class Config: from_attributes = True diff --git a/comps/dataprep/minio_utils.py b/comps/dataprep/minio_utils.py deleted file mode 100644 index 916f3a44b2..0000000000 --- a/comps/dataprep/minio_utils.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/minio/lancedb/langchain/Dockerfile b/comps/retrievers/minio/lancedb/langchain/Dockerfile index c3232591f3..e9addf4314 100644 --- a/comps/retrievers/minio/lancedb/langchain/Dockerfile +++ b/comps/retrievers/minio/lancedb/langchain/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 FROM python:3.11-slim diff --git a/comps/retrievers/minio/lancedb/langchain/__init__.py b/comps/retrievers/minio/lancedb/langchain/__init__.py index 916f3a44b2..1f58e77f82 100644 --- a/comps/retrievers/minio/lancedb/langchain/__init__.py +++ b/comps/retrievers/minio/lancedb/langchain/__init__.py @@ -1,2 +1,2 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. 
# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/minio/lancedb/langchain/config.py b/comps/retrievers/minio/lancedb/langchain/config.py index a0b41163ed..207aef94e3 100644 --- a/comps/retrievers/minio/lancedb/langchain/config.py +++ b/comps/retrievers/minio/lancedb/langchain/config.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 import os diff --git a/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py index 71d833aeb8..8e05ad9662 100644 --- a/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py +++ b/comps/retrievers/minio/lancedb/langchain/retriever_lancedb.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (c) 2015-2024 MinIO, Inc. # SPDX-License-Identifier: Apache-2.0 import os From 29526118fe863fee2e4c59ee1587ad88713d9868 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Nov 2024 21:07:51 +0000 Subject: [PATCH 25/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../minio/milvus/langchain/prepare_doc_milvus.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py index 6a5f3aa688..ca1a021e17 100644 --- a/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/minio/milvus/langchain/prepare_doc_milvus.py @@ -59,7 +59,7 @@ class MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] @@ 
-98,7 +98,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): for i in range(0, num_chunks, batch_size): if logflag: logger.info(f"[ ingest chunks ] Current batch: {i}") - batch_docs = insert_docs[i: i + batch_size] + batch_docs = insert_docs[i : i + batch_size] try: logger.info(f"MILVUS HOST IS: {MILVUS_HOST}") @@ -220,12 +220,12 @@ def delete_by_partition_field(my_milvus, partition_field): @register_microservice(name="opea_service@prepare_doc_minio_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) async def ingest_documents( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1000), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), ): if logflag: logger.info(f"[ upload ] files:{files}") From ac5413f45cc1017d9be8f112c66b9eac84c153e1 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:59:34 -0800 Subject: [PATCH 26/28] Added new docker files to github workflow --- .github/workflows/docker/compose/dataprep-compose.yaml | 8 ++++++++ .github/workflows/docker/compose/retrievers-compose.yaml | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index d18c141c84..da62a7e523 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -59,3 +59,11 @@ services: build: dockerfile: comps/dataprep/opensearch/langchain/Dockerfile image: ${REGISTRY:-opea}/dataprep-opensearch:${TAG:-latest} + dataprep-minio-lancedb: + build: + 
dockerfile: comps/dataprep/minio/lancedb/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-minio-lancedb:${TAG:-latest} + dataprep-minio-milvus: + build: + dockerfile: comps/dataprep/minio/milvus/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-minio-milvus:${TAG:-latest} \ No newline at end of file diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index 00d95fe6b7..bb7b9557d7 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -51,3 +51,7 @@ services: build: dockerfile: comps/retrievers/opensearch/langchain/Dockerfile image: ${REGISTRY:-opea}/retriever-opensearch:${TAG:-latest} + retriever-minio-lancedb: + build: + dockerfile: comps/retrievers/minio/lancedb/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-minio-lancedb:${TAG:-latest} \ No newline at end of file From fba854aa36a3ca8538d9e2d93263aab2dfbd3f94 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:40:51 -0800 Subject: [PATCH 27/28] Add new lines to the workflows --- .../docker/compose/dataprep-compose.yaml | 129 +++++++++--------- .../docker/compose/retrievers-compose.yaml | 3 +- 2 files changed, 67 insertions(+), 65 deletions(-) diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index da62a7e523..c94f5eca77 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -3,67 +3,68 @@ # this file should be run in the root of the repo services: - dataprep-redis: - build: - dockerfile: comps/dataprep/redis/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - dataprep-qdrant: - build: - dockerfile: comps/dataprep/qdrant/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-qdrant:${TAG:-latest} 
- dataprep-on-ray-redis: - build: - dockerfile: comps/dataprep/redis/langchain_ray/Dockerfile - image: ${REGISTRY:-opea}/dataprep-on-ray-redis:${TAG:-latest} - dataprep-multimodal-vdms: - build: - dockerfile: comps/dataprep/vdms/multimodal_langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-multimodal-vdms:${TAG:-latest} - dataprep-multimodal-redis: - build: - dockerfile: comps/dataprep/multimodal/redis/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest} - dataprep-redis-llama-index: - build: - dockerfile: comps/dataprep/redis/llama_index/Dockerfile - image: ${REGISTRY:-opea}/dataprep-redis-llama-index:${TAG:-latest} - dataprep-milvus: - build: - dockerfile: comps/dataprep/milvus/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-milvus:${TAG:-latest} - dataprep-pgvector: - build: - dockerfile: comps/dataprep/pgvector/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-pgvector:${TAG:-latest} - dataprep-pinecone: - build: - dockerfile: comps/dataprep/pinecone/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} - dataprep-vdms: - build: - dockerfile: comps/dataprep/vdms/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest} - dataprep-neo4j: - build: - dockerfile: comps/dataprep/neo4j/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-neo4j:${TAG:-latest} - dataprep-neo4j-llamaindex: - build: - dockerfile: comps/dataprep/neo4j/llama_index/Dockerfile - image: ${REGISTRY:-opea}/dataprep-neo4j-llamaindex:${TAG:-latest} - dataprep-elasticsearch: - build: - dockerfile: comps/dataprep/elasticsearch/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-elasticsearch:${TAG:-latest} - dataprep-opensearch: - build: - dockerfile: comps/dataprep/opensearch/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-opensearch:${TAG:-latest} - dataprep-minio-lancedb: - build: - dockerfile: comps/dataprep/minio/lancedb/langchain/Dockerfile - image: 
${REGISTRY:-opea}/dataprep-minio-lancedb:${TAG:-latest} - dataprep-minio-milvus: - build: - dockerfile: comps/dataprep/minio/milvus/langchain/Dockerfile - image: ${REGISTRY:-opea}/dataprep-minio-milvus:${TAG:-latest} \ No newline at end of file + dataprep-redis: + build: + dockerfile: comps/dataprep/redis/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} + dataprep-qdrant: + build: + dockerfile: comps/dataprep/qdrant/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-qdrant:${TAG:-latest} + dataprep-on-ray-redis: + build: + dockerfile: comps/dataprep/redis/langchain_ray/Dockerfile + image: ${REGISTRY:-opea}/dataprep-on-ray-redis:${TAG:-latest} + dataprep-multimodal-vdms: + build: + dockerfile: comps/dataprep/vdms/multimodal_langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-multimodal-vdms:${TAG:-latest} + dataprep-multimodal-redis: + build: + dockerfile: comps/dataprep/multimodal/redis/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest} + dataprep-redis-llama-index: + build: + dockerfile: comps/dataprep/redis/llama_index/Dockerfile + image: ${REGISTRY:-opea}/dataprep-redis-llama-index:${TAG:-latest} + dataprep-milvus: + build: + dockerfile: comps/dataprep/milvus/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-milvus:${TAG:-latest} + dataprep-pgvector: + build: + dockerfile: comps/dataprep/pgvector/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-pgvector:${TAG:-latest} + dataprep-pinecone: + build: + dockerfile: comps/dataprep/pinecone/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} + dataprep-vdms: + build: + dockerfile: comps/dataprep/vdms/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest} + dataprep-neo4j: + build: + dockerfile: comps/dataprep/neo4j/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-neo4j:${TAG:-latest} + dataprep-neo4j-llamaindex: + build: + dockerfile: 
comps/dataprep/neo4j/llama_index/Dockerfile + image: ${REGISTRY:-opea}/dataprep-neo4j-llamaindex:${TAG:-latest} + dataprep-elasticsearch: + build: + dockerfile: comps/dataprep/elasticsearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-elasticsearch:${TAG:-latest} + dataprep-opensearch: + build: + dockerfile: comps/dataprep/opensearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-opensearch:${TAG:-latest} + dataprep-minio-lancedb: + build: + dockerfile: comps/dataprep/minio/lancedb/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-minio-lancedb:${TAG:-latest} + dataprep-minio-milvus: + build: + dockerfile: comps/dataprep/minio/milvus/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-minio-milvus:${TAG:-latest} + \ No newline at end of file diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index bb7b9557d7..a648a2250c 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -54,4 +54,5 @@ services: retriever-minio-lancedb: build: dockerfile: comps/retrievers/minio/lancedb/langchain/Dockerfile - image: ${REGISTRY:-opea}/retriever-minio-lancedb:${TAG:-latest} \ No newline at end of file + image: ${REGISTRY:-opea}/retriever-minio-lancedb:${TAG:-latest} + \ No newline at end of file From d526d2847d1a3f3c0054bcf6c686824d4f728466 Mon Sep 17 00:00:00 2001 From: dilverse <109769432+dilverse@users.noreply.github.com> Date: Fri, 27 Dec 2024 12:06:32 -0800 Subject: [PATCH 28/28] Remove trailing spaces --- .github/workflows/docker/compose/dataprep-compose.yaml | 1 - .github/workflows/docker/compose/retrievers-compose.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index c94f5eca77..2a4b92ca04 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ 
b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -67,4 +67,3 @@ services: build: dockerfile: comps/dataprep/minio/milvus/langchain/Dockerfile image: ${REGISTRY:-opea}/dataprep-minio-milvus:${TAG:-latest} - \ No newline at end of file diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index a648a2250c..8835df36ac 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -55,4 +55,3 @@ services: build: dockerfile: comps/retrievers/minio/lancedb/langchain/Dockerfile image: ${REGISTRY:-opea}/retriever-minio-lancedb:${TAG:-latest} - \ No newline at end of file