From f234506c24e342f8984a0d3acc0ad067708c7959 Mon Sep 17 00:00:00 2001 From: Kesha Williams Date: Wed, 10 Jun 2026 22:08:45 +0000 Subject: [PATCH] Add semantic search workshop Signed-off-by: Kesha Williams --- .../.devcontainer/cache_model.py | 10 + .../.devcontainer/devcontainer.json | 26 ++ .../.devcontainer/requirements.txt | 20 + .../.env.example | 6 + .../semantic-search-github-issues/.gitignore | 20 + .../semantic-search-github-issues/README.md | 87 ++++ .../notebook.ipynb | 414 ++++++++++++++++++ 7 files changed, 583 insertions(+) create mode 100644 workshops/semantic-search-github-issues/.devcontainer/cache_model.py create mode 100644 workshops/semantic-search-github-issues/.devcontainer/devcontainer.json create mode 100644 workshops/semantic-search-github-issues/.devcontainer/requirements.txt create mode 100644 workshops/semantic-search-github-issues/.env.example create mode 100644 workshops/semantic-search-github-issues/.gitignore create mode 100644 workshops/semantic-search-github-issues/README.md create mode 100644 workshops/semantic-search-github-issues/notebook.ipynb diff --git a/workshops/semantic-search-github-issues/.devcontainer/cache_model.py b/workshops/semantic-search-github-issues/.devcontainer/cache_model.py new file mode 100644 index 00000000..299eb740 --- /dev/null +++ b/workshops/semantic-search-github-issues/.devcontainer/cache_model.py @@ -0,0 +1,10 @@ +""" +Pre-cache the embedding model during Codespace build. +""" +from sentence_transformers import SentenceTransformer + +MODEL = "sentence-transformers/all-MiniLM-L6-v2" + +print(f"Pre-caching {MODEL}...") +SentenceTransformer(MODEL) +print("Done. 
Model is cached and ready.") diff --git a/workshops/semantic-search-github-issues/.devcontainer/devcontainer.json b/workshops/semantic-search-github-issues/.devcontainer/devcontainer.json new file mode 100644 index 00000000..ed2f8523 --- /dev/null +++ b/workshops/semantic-search-github-issues/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "Oracle AI Vector Search Tutorial", + "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bookworm", + + "customizations": { + "vscode": { + "extensions": ["ms-python.python", "ms-toolsai.jupyter", "ms-toolsai.jupyter-renderers"], + "settings": { + "python.defaultInterpreterPath": "/usr/local/bin/python", + "jupyter.askForKernelRestart": false + } + } + }, + + "onCreateCommand": "pip install --no-cache-dir -r .devcontainer/requirements.txt && python .devcontainer/cache_model.py", + + "remoteEnv": { + "HF_HOME": "/home/vscode/.cache/huggingface" + }, + + "postAttachCommand": { + "open-notebook": "code notebook.ipynb" + }, + + "remoteUser": "vscode" +} diff --git a/workshops/semantic-search-github-issues/.devcontainer/requirements.txt b/workshops/semantic-search-github-issues/.devcontainer/requirements.txt new file mode 100644 index 00000000..3972a3bc --- /dev/null +++ b/workshops/semantic-search-github-issues/.devcontainer/requirements.txt @@ -0,0 +1,20 @@ +# Oracle integration +langchain-oracledb>=0.1.0 +oracledb>=2.5.0 + +# LangChain core +langchain-core>=0.3.0 +langchain-community>=0.3.0 +langchain-huggingface>=0.1.0 + +# Embeddings +sentence-transformers>=3.0.0 + +# HTTP for GitHub API +requests>=2.31.0 + +# .env file support +python-dotenv>=1.0.0 + +# Notebook runtime +ipykernel>=6.29.0 diff --git a/workshops/semantic-search-github-issues/.env.example b/workshops/semantic-search-github-issues/.env.example new file mode 100644 index 00000000..81dba59b --- /dev/null +++ b/workshops/semantic-search-github-issues/.env.example @@ -0,0 +1,6 @@ +# FreeSQL credentials +# Get these from freesql.com -> Connect dialog 
-> Regenerate password +# Your DSN is shown in the same Connect dialog. Copy it in full, including the service name suffix. +ORACLE_USER=your_freesql_schema_name +ORACLE_PASSWORD=your_freesql_password +ORACLE_DSN=tcps://db.freesql.com:XXXX/your_service_name diff --git a/workshops/semantic-search-github-issues/.gitignore b/workshops/semantic-search-github-issues/.gitignore new file mode 100644 index 00000000..7cb358ae --- /dev/null +++ b/workshops/semantic-search-github-issues/.gitignore @@ -0,0 +1,20 @@ +# Secrets +.env + +# Python +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +.pytest_cache/ + +# Jupyter +.ipynb_checkpoints/ + +# OS +.DS_Store +Thumbs.db + +# Editor +.vscode/ +*.swp diff --git a/workshops/semantic-search-github-issues/README.md b/workshops/semantic-search-github-issues/README.md new file mode 100644 index 00000000..62e15d44 --- /dev/null +++ b/workshops/semantic-search-github-issues/README.md @@ -0,0 +1,87 @@ +# Semantic Search over GitHub Issues Workshop + +**Build a working semantic search engine over real GitHub issues with Oracle AI Database 26ai and `langchain-oracledb` in 10 minutes** + +--- + +## What You Will Build + +Starting from a public GitHub repository, you will build a semantic search engine that finds bug reports by **meaning** rather than keywords. You'll pull 15 issues from the `oracle/python-oracledb` repo via the GitHub REST API, store them as vector embeddings in Oracle AI Database 26ai using `langchain-oracledb`, and run similarity queries with metadata filters. By the end you'll see why keyword search fails on the same query, how hybrid filtering combines vector ranking with structured WHERE clauses, and what the underlying `VECTOR_DISTANCE` SQL looks like. + +The workshop runs entirely against [FreeSQL](https://freesql.com), Oracle's free browser-based AI Database sandbox. + +## Getting Started + +This workshop lives inside the [oracle-ai-developer-hub](https://github.com/oracle-devrel/oracle-ai-developer-hub) repository. 
Use **git sparse-checkout** to pull just this workshop without cloning the rest of the hub: + +```bash +# Clone the hub with no files and no blobs +git clone --filter=blob:none --no-checkout https://github.com/oracle-devrel/oracle-ai-developer-hub.git +cd oracle-ai-developer-hub + +# Enable sparse-checkout and select only this workshop +git sparse-checkout init --cone +git sparse-checkout set workshops/semantic-search-github-issues + +# Materialise the files and move into the workshop +git checkout main +cd workshops/semantic-search-github-issues + +# Install dependencies +pip install -r .devcontainer/requirements.txt + +# Set up your FreeSQL credentials +cp .env.example .env +# Edit .env with credentials from freesql.com → Connect → Python tab + +# Launch Jupyter (requirements.txt installs only ipykernel; run `pip install jupyterlab` first if Jupyter is not already installed) +jupyter lab notebook.ipynb +``` + +> **Updating later:** `git pull` from inside `oracle-ai-developer-hub` refreshes only the paths you've selected with sparse-checkout. + +## Workshop Files + +``` +semantic-search-github-issues/ +├── .devcontainer/ +│ ├── devcontainer.json Dev container configuration +│ ├── requirements.txt Python dependencies (pinned minimums) +│ └── cache_model.py Pre-caches embedding model during build +├── .env.example Credential template +├── .gitignore Excludes .env from commits +├── notebook.ipynb Workshop notebook (10 cells) +└── README.md +``` + +## Stack + +- **Oracle AI Database 26ai** via [FreeSQL](https://freesql.com) — vector storage and search, no local install +- `langchain-oracledb` — Python vector store integration +- `sentence-transformers` — local embedding model (`all-MiniLM-L6-v2`, 384-dim), no API key needed +- `python-oracledb` thin mode — pure Python Oracle driver, no client libraries to install + +## What the Notebook Covers + +| Cell | What it does | +| ---- | --------------------------------------------------------------------------- | +| 1 | Connect to FreeSQL via `python-oracledb` thin mode, credentials from `.env` | +| 2 | Pull 15 recent issues from 
`oracle/python-oracledb` via GitHub REST API | +| 3 | Shape issues into LangChain `Document` objects with metadata | +| 4 | Load the `all-MiniLM-L6-v2` embedding model | +| 5 | `OracleVS.from_documents()` (creates table, embeds, inserts in one call) | +| 6 | Similarity search for "connection pool errors" | +| 7 | Same query as a SQL `LIKE` (returns zero matches) | +| 8 | Hybrid filter: vector similarity + `state=open` | +| 9 | Behind the abstraction: raw SQL with `VECTOR_DISTANCE` and `JSON_VALUE` | +| 10 | Cleanup (drop the demo table) | + +## Where to Next? + +- **[Oracle AI Developer Hub](https://github.com/oracle-devrel/oracle-ai-developer-hub)** — More technical assets, samples, and projects with Oracle AI +- **[Oracle AI Vector Search docs](https://docs.oracle.com/en/database/oracle/oracle-database/26/vecse/)** — Full reference for the `VECTOR` data type, distance functions, and index types +- **[Oracle Developer Resource](https://www.oracle.com/developer/)** — Documentation, tools, and community for Oracle developers + +--- + +Built in partnership with Oracle diff --git a/workshops/semantic-search-github-issues/notebook.ipynb b/workshops/semantic-search-github-issues/notebook.ipynb new file mode 100644 index 00000000..5fffe91b --- /dev/null +++ b/workshops/semantic-search-github-issues/notebook.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Search Over GitHub Issues with Oracle AI Vector Search\n", + "\n", + "This notebook pulls real issues from a public GitHub repo, stores them as vector embeddings in Oracle AI Database 26ai using `langchain-oracledb`, and runs semantic similarity queries with metadata filters.\n", + "\n", + "**Before running:** copy `.env.example` to `.env` and fill in your FreeSQL credentials and DSN." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Connect to Oracle AI Database\n", + "\n", + "`load_dotenv()` reads your `.env` file, then we connect using `python-oracledb`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected: 23.9.0.25.7\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "import oracledb\n", + "\n", + "load_dotenv()\n", + "\n", + "connection = oracledb.connect(\n", + " user=os.environ[\"ORACLE_USER\"],\n", + " password=os.environ[\"ORACLE_PASSWORD\"],\n", + " dsn=os.environ[\"ORACLE_DSN\"],\n", + ")\n", + "print(\"Connected:\", connection.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Pull recent issues from GitHub\n", + "\n", + "Public REST API, no auth needed for this small a request. 60 calls per hour is the unauthenticated limit" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pulled 15 issues\n", + "\n", + "Example: #594 [open] t-strings for parameterized quries\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "url = \"https://api.github.com/repos/oracle/python-oracledb/issues\"\n", + "params = {\"state\": \"all\", \"per_page\": 15}\n", + "issues = requests.get(url, params=params).json()\n", + "print(f\"Pulled {len(issues)} issues\")\n", + "\n", + "# Peek at the first one\n", + "print(f\"\\nExample: #{issues[0]['number']} [{issues[0]['state']}] {issues[0]['title']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Shape issues into LangChain Documents\n", + "\n", + "Title plus body becomes the searchable content. State, labels, and issue number become metadata for filtering later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 15 LangChain documents\n" + ] + } + ], + "source": [ + "from langchain_core.documents import Document\n", + "\n", + "docs = [\n", + " Document(\n", + " page_content=f\"{i['title']}\\n\\n{i.get('body') or ''}\",\n", + " metadata={\n", + " \"number\": i[\"number\"],\n", + " \"state\": i[\"state\"],\n", + " \"labels\": \",\".join(l[\"name\"] for l in i[\"labels\"]),\n", + " \"url\": i[\"html_url\"],\n", + " },\n", + " )\n", + " for i in issues\n", + "]\n", + "\n", + "print(f\"Built {len(docs)} LangChain documents\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load the embedding model\n", + "\n", + "`all-MiniLM-L6-v2` is small (~90MB), fast on CPU, and produces 384-dimensional vectors. The devcontainer pre-cached this during build, so this cell runs in a second or two." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/vscode/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", + "Loading weights: 100%|██████████| 103/103 [00:01<00:00, 56.70it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embedding model ready\n" + ] + } + ], + "source": [ + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "print(\"Embedding model ready\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Store as vectors in Oracle\n", + "\n", + "`OracleVS.from_documents()` does three things in one call: creates the table with a `VECTOR(384, FLOAT32)` column, generates embeddings for every document, and inserts them all." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Issues stored as vectors\n" + ] + } + ], + "source": [ + "from langchain_oracledb.vectorstores.oraclevs import OracleVS\n", + "from langchain_community.vectorstores.utils import DistanceStrategy\n", + "\n", + "# Drop the table if it exists so re-runs start clean\n", + "try:\n", + " cursor = connection.cursor()\n", + " cursor.execute(\"DROP TABLE gh_issues PURGE\")\n", + " connection.commit()\n", + "except oracledb.DatabaseError:\n", + " pass # Table didn't exist, fine\n", + "\n", + "vs = OracleVS.from_documents(\n", + " documents=docs,\n", + " embedding=embeddings,\n", + " client=connection,\n", + " table_name=\"GH_ISSUES\",\n", + " distance_strategy=DistanceStrategy.COSINE,\n", + ")\n", + "print(\"Issues stored as vectors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Run a similarity search\n", + "\n", + "Ask for the 5 issues semantically closest to \"connection pool errors.\" Results come back ranked by cosine distance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#576 [closed] Connection failing while connecting with create_pool_async but it works with connect function\n", + "#581 [closed] Thin mode: pool grows beyond max after TIMEDWAIT requests time out\n", + "#587 [closed] Thin async crashes with AttributeError: 'NoneType' object has no attribute 'decode' in _process_keyword_value_pairs\n", + "#579 [open] DPY-6005 / ORA-12506: All documented Thin mode IAM token approaches fail on ADB-S Private Endpoint — TOKEN_AUTH not included in TNS Connect packet\n", + "#589 [closed] IFILE path incorrectly appended to CONFIG_DIR\n" + ] + } + ], + "source": [ + "results = vs.similarity_search(\"connection pool errors\", k=5)\n", + "for r in results:\n", + " title = r.page_content.split(\"\\n\")[0]\n", + " print(f\"#{r.metadata['number']} [{r.metadata['state']}] {title}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Compare to keyword search\n", + "\n", + "Same logical question, but as a plain SQL `LIKE` query." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Keyword matches: 0\n" + ] + } + ], + "source": [ + "cursor = connection.cursor()\n", + "cursor.execute(\"\"\"\n", + " SELECT id, text\n", + " FROM gh_issues\n", + " WHERE LOWER(text) LIKE '%connection pool errors%'\n", + " FETCH FIRST 5 ROWS ONLY\n", + "\"\"\")\n", + "rows = cursor.fetchall()\n", + "print(f\"Keyword matches: {len(rows)}\")\n", + "for row in rows:\n", + " print(row)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Hybrid filter: semantic + metadata\n", + "\n", + "Same semantic query, restricted to issues where `state = 'open'`. The filter pushes down into the SQL alongside the vector distance calculation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#579 [open] DPY-6005 / ORA-12506: All documented Thin mode IAM token approaches fail on ADB-\n", + "#592 [open] Thin mode fails with ORA-24964 when trying to open a PDB migrated from Oracle 19\n", + "#593 [open] Thin mode direct path load fails when arrow type is boolean and db type is NUM_B\n", + "#594 [open] t-strings for parameterized quries\n", + "\n", + "Are there plans to add support for using t-s\n" + ] + } + ], + "source": [ + "results = vs.similarity_search(\n", + " \"connection pool errors\",\n", + " k=5,\n", + " filter={\"state\": \"open\"},\n", + ")\n", + "for r in results:\n", + " print(f\"#{r.metadata['number']} [{r.metadata['state']}] {r.page_content[:80]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Behind the abstraction: the raw SQL\n", + "\n", + "Everything `similarity_search` does is a wrapper around Oracle's native vector SQL. Here's the same query as cell 6, written as pure SQL. `VECTOR_DISTANCE` is the function, `COSINE` is the distance strategy, `ORDER BY distance` gives you nearest first, `FETCH FIRST N ROWS ONLY` is your top-k." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#576 [closed] distance=0.402\n", + "#581 [closed] distance=0.438\n", + "#587 [closed] distance=0.634\n", + "#579 [open] distance=0.671\n", + "#589 [closed] distance=0.671\n" + ] + } + ], + "source": [ + "import array\n", + "\n", + "query_vec = embeddings.embed_query(\"connection pool errors\")\n", + "query_vec = array.array(\"f\", query_vec)\n", + "\n", + "cursor = connection.cursor()\n", + "cursor.execute(\"\"\"\n", + " SELECT JSON_VALUE(metadata, '$.number') AS issue_number,\n", + " JSON_VALUE(metadata, '$.state') AS state,\n", + " VECTOR_DISTANCE(embedding, :qv, COSINE) AS distance\n", + " FROM gh_issues\n", + " ORDER BY distance\n", + " FETCH FIRST 5 ROWS ONLY\n", + "\"\"\", qv=query_vec)\n", + "\n", + "for issue_number, state, distance in cursor.fetchall():\n", + " print(f\"#{issue_number} [{state}] distance={distance:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Cleanup\n", + "\n", + "Skip this cell if you want to keep the data around to experiment with." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaned up\n" + ] + } + ], + "source": [ + "cursor = connection.cursor()\n", + "cursor.execute(\"DROP TABLE gh_issues PURGE\")\n", + "connection.commit()\n", + "print(\"Cleaned up\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}