# Voice Enabled RAG Agent with OpenAI Agents SDK

**rag_tutorials/voice_rag_openaisdk/README.md** (new file)

## 🎙️ Voice RAG with OpenAI SDK

This script demonstrates how to build a voice-enabled Retrieval-Augmented Generation (RAG) system using OpenAI's Agents SDK and Streamlit. The application lets users upload PDF documents, ask questions about them, and receive both text and voice responses generated with OpenAI's text-to-speech models.

### Features

- Creates a voice-enabled RAG system using OpenAI's Agents SDK
- Supports PDF document processing and chunking
- Uses Qdrant as the vector database for efficient similarity search
- Implements real-time text-to-speech with multiple voice options
- Provides a user-friendly Streamlit interface
- Allows downloading of generated audio responses
- Supports multiple document uploads and tracking

### How to get started

1. Clone the GitHub repository:

```bash
git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
cd awesome-llm-apps/rag_tutorials/voice_rag_openaisdk
```

2. Install the required dependencies:

```bash
pip install -r requirements.txt
```

3. Set up your API keys:
- Get your [OpenAI API key](https://platform.openai.com/)
- Set up a [Qdrant Cloud](https://cloud.qdrant.io/) account and get your API key and URL
- Create a `.env` file with your credentials (a quick sanity check for this file is sketched after these steps):

```bash
OPENAI_API_KEY='your-openai-api-key'
QDRANT_URL='your-qdrant-url'
QDRANT_API_KEY='your-qdrant-api-key'
```

4. Run the Voice RAG application:

```bash
streamlit run rag_voice.py
```

5. Open your web browser and navigate to the URL provided in the console output to interact with the Voice RAG system.
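
Before launching the app, you can confirm that the `.env` file loads correctly. A minimal sketch (a hypothetical `check_env.py` helper, not part of the tutorial code; it assumes only the three variable names above):

```python
# check_env.py: quick sanity check for the .env file (hypothetical helper)
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

# The three variables the app expects; flag any that are missing
for var in ("OPENAI_API_KEY", "QDRANT_URL", "QDRANT_API_KEY"):
    print(f"{var}: {'set' if os.getenv(var) else 'MISSING'}")
```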

### How it works

1. **Document Processing:**
   - Upload PDF documents through the Streamlit interface
   - Documents are split into chunks using LangChain's `RecursiveCharacterTextSplitter`
   - Each chunk is embedded using FastEmbed and stored in Qdrant

2. **Query Processing** (a condensed sketch of this step follows the list):
   - User questions are converted to embeddings
   - Similar documents are retrieved from Qdrant
   - A processing agent generates a clear, spoken-word-friendly response
   - A TTS agent optimizes the response for speech synthesis

3. **Voice Generation:**
   - Text responses are converted to speech using OpenAI's TTS
   - Users can choose from multiple voice options
   - Audio can be played directly or downloaded as MP3

4. **Features:**
   - Real-time audio streaming
   - Multiple voice personality options
   - Document source tracking
   - Download capability for audio responses
   - Progress tracking for document processing
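
To make the query stage concrete, here is a condensed sketch of the retrieval step, simplified from the full `rag_voice.py` implementation below: embed the question with FastEmbed, fetch the top matches from Qdrant, and assemble the context handed to the processor agent.

```python
# Condensed sketch of the retrieval step (simplified from rag_voice.py below)
from fastembed import TextEmbedding
from qdrant_client import QdrantClient

embedding_model = TextEmbedding()  # FastEmbed's default embedding model
client = QdrantClient(url="your-qdrant-url", api_key="your-qdrant-api-key")

query = "How do I authenticate API requests?"
query_embedding = list(embedding_model.embed([query]))[0]

# Fetch the three most similar chunks (the collection uses cosine distance)
response = client.query_points(
    collection_name="voice-rag-agent",
    query=query_embedding.tolist(),
    limit=3,
    with_payload=True,
)

# Assemble the context the processor agent answers from
context = "Based on the following documentation:\n\n"
for point in response.points:
    source = point.payload.get("file_name", "Unknown Source")
    context += f"From {source}:\n{point.payload.get('content', '')}\n\n"
```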

**rag_tutorials/voice_rag_openaisdk/rag_voice.py** (new file)

```python
from typing import List, Dict, Optional, Tuple
import os
import tempfile
from datetime import datetime
import uuid
import asyncio

import streamlit as st
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from fastembed import TextEmbedding
from openai import AsyncOpenAI
from openai.helpers import LocalAudioPlayer
from agents import Agent, Runner

load_dotenv()

# Constants
COLLECTION_NAME = "voice-rag-agent"


def init_session_state() -> None:
    """Initialize Streamlit session state with default values."""
    defaults = {
        "initialized": False,
        "qdrant_url": "",
        "qdrant_api_key": "",
        "openai_api_key": "",
        "setup_complete": False,
        "client": None,
        "embedding_model": None,
        "processor_agent": None,
        "tts_agent": None,
        "selected_voice": "coral",
        "processed_documents": []
    }

    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def setup_sidebar() -> None:
    """Configure sidebar with API settings and voice options."""
    with st.sidebar:
        st.title("🔑 Configuration")
        st.markdown("---")

        st.session_state.qdrant_url = st.text_input(
            "Qdrant URL",
            value=st.session_state.qdrant_url,
            type="password"
        )
        st.session_state.qdrant_api_key = st.text_input(
            "Qdrant API Key",
            value=st.session_state.qdrant_api_key,
            type="password"
        )
        st.session_state.openai_api_key = st.text_input(
            "OpenAI API Key",
            value=st.session_state.openai_api_key,
            type="password"
        )

        st.markdown("---")
        st.markdown("### 🎤 Voice Settings")
        voices = ["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "verse"]
        st.session_state.selected_voice = st.selectbox(
            "Select Voice",
            options=voices,
            index=voices.index(st.session_state.selected_voice),
            help="Choose the voice for the audio response"
        )


def setup_qdrant() -> Tuple[QdrantClient, TextEmbedding]:
    """Initialize Qdrant client and embedding model."""
    if not all([st.session_state.qdrant_url, st.session_state.qdrant_api_key]):
        raise ValueError("Qdrant credentials not provided")

    client = QdrantClient(
        url=st.session_state.qdrant_url,
        api_key=st.session_state.qdrant_api_key
    )

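    # TextEmbedding() loads FastEmbed's default model (BAAI/bge-small-en-v1.5 at the
    # time of writing); embedding a test string discovers the vector size at runtime
    # instead of hard-coding it.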
    embedding_model = TextEmbedding()
    test_embedding = list(embedding_model.embed(["test"]))[0]
    embedding_dim = len(test_embedding)

    try:
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                size=embedding_dim,
                distance=Distance.COSINE
            )
        )
    except Exception as e:
        if "already exists" not in str(e):
            raise

    return client, embedding_model


def process_pdf(file) -> List:
    """Process PDF file and split into chunks with metadata."""
    try:
        # Write the upload to disk so PyPDFLoader can read it; close the handle
        # before loading so the contents are fully flushed
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file.getvalue())
            tmp_path = tmp_file.name

        loader = PyPDFLoader(tmp_path)
        documents = loader.load()
        os.unlink(tmp_path)  # clean up the temporary copy

        # Add source metadata
        for doc in documents:
            doc.metadata.update({
                "source_type": "pdf",
                "file_name": file.name,
                "timestamp": datetime.now().isoformat()
            })

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        st.error(f"📄 PDF processing error: {str(e)}")
        return []


def store_embeddings(
    client: QdrantClient,
    embedding_model: TextEmbedding,
    documents: List,
    collection_name: str
) -> None:
    """Store document embeddings in Qdrant."""
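    # Note: one upsert per chunk keeps the flow simple; batching several points
    # per upsert call would reduce network round-trips for large documents.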
    for doc in documents:
        embedding = list(embedding_model.embed([doc.page_content]))[0]
        client.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=embedding.tolist(),
                    payload={
                        "content": doc.page_content,
                        **doc.metadata
                    }
                )
            ]
        )


def setup_agents(openai_api_key: str) -> Tuple[Agent, Agent]:
    """Initialize the processor and TTS agents."""
    os.environ["OPENAI_API_KEY"] = openai_api_key

    processor_agent = Agent(
        name="Documentation Processor",
        instructions="""You are a helpful documentation assistant. Your task is to:
        1. Analyze the provided documentation content
        2. Answer the user's question clearly and concisely
        3. Include relevant examples when available
        4. Cite the source files when referencing specific content
        5. Keep responses natural and conversational
        6. Format your response in a way that's easy to speak out loud""",
        model="gpt-4o"
    )

    tts_agent = Agent(
        name="Text-to-Speech Agent",
        instructions="""You are a text-to-speech agent. Your task is to:
        1. Convert the processed documentation response into natural speech
        2. Maintain proper pacing and emphasis
        3. Handle technical terms clearly
        4. Keep the tone professional but friendly
        5. Use appropriate pauses for better comprehension
        6. Ensure the speech is clear and well-articulated""",
        model="gpt-4o"
    )

    return processor_agent, tts_agent


async def process_query(
    query: str,
    client: QdrantClient,
    embedding_model: TextEmbedding,
    collection_name: str,
    openai_api_key: str,
    voice: str
) -> Dict:
    """Process user query and generate voice response."""
    try:
        st.info("🔄 Step 1: Generating query embedding and searching documents...")
        # Get query embedding and search
        query_embedding = list(embedding_model.embed([query]))[0]
        st.write(f"Generated embedding of size: {len(query_embedding)}")

        search_response = client.query_points(
            collection_name=collection_name,
            query=query_embedding.tolist(),
            limit=3,
            with_payload=True
        )

        search_results = search_response.points if hasattr(search_response, 'points') else []
        st.write(f"Found {len(search_results)} relevant documents")

        if not search_results:
            raise Exception("No relevant documents found in the vector database")

        st.info("🔄 Step 2: Preparing context from search results...")
        # Prepare context from search results
        context = "Based on the following documentation:\n\n"
        for i, result in enumerate(search_results, 1):
            payload = result.payload
            if not payload:
                continue
            content = payload.get('content', '')
            source = payload.get('file_name', 'Unknown Source')
            context += f"From {source}:\n{content}\n\n"
            st.write(f"Document {i} from: {source}")

        context += f"\nUser Question: {query}\n\n"
        context += "Please provide a clear, concise answer that can be easily spoken out loud."

        st.info("🔄 Step 3: Setting up agents...")
        # Setup agents if not already done
        if not st.session_state.processor_agent or not st.session_state.tts_agent:
            processor_agent, tts_agent = setup_agents(openai_api_key)
            st.session_state.processor_agent = processor_agent
            st.session_state.tts_agent = tts_agent
            st.write("Initialized new processor and TTS agents")
        else:
            st.write("Using existing agents")

        st.info("🔄 Step 4: Generating text response...")
        # Generate text response using processor agent
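        # Runner.run() executes the Agents SDK agent loop asynchronously and returns
        # a RunResult; final_output holds the agent's final text reply.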
        processor_result = await Runner.run(st.session_state.processor_agent, context)
        text_response = processor_result.final_output
        st.write(f"Generated text response of length: {len(text_response)}")

        st.info("🔄 Step 5: Generating voice instructions...")
        # Generate voice instructions using TTS agent
        tts_result = await Runner.run(st.session_state.tts_agent, text_response)
        voice_instructions = tts_result.final_output
        st.write(f"Generated voice instructions of length: {len(voice_instructions)}")

        st.info("🔄 Step 6: Generating and playing audio...")
        # Generate and play audio with streaming
        async_openai = AsyncOpenAI(api_key=openai_api_key)

        # First create streaming response
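        # response_format="pcm" streams raw 16-bit, 24 kHz mono samples, the format
        # LocalAudioPlayer plays directly (it relies on the optional numpy/sounddevice
        # extras: pip install "openai[voice_helpers]").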
        async with async_openai.audio.speech.with_streaming_response.create(
            model="gpt-4o-mini-tts",
            voice=voice,
            input=text_response,
            instructions=voice_instructions,
            response_format="pcm",
        ) as stream_response:
            st.write("Starting audio playback...")
            # Play audio directly using LocalAudioPlayer
            await LocalAudioPlayer().play(stream_response)
            st.write("Audio playback complete")

        st.write("Generating downloadable MP3 version...")
        # Also save as MP3 for download
        audio_response = await async_openai.audio.speech.create(
            model="gpt-4o-mini-tts",
            voice=voice,
            input=text_response,
            instructions=voice_instructions,
            response_format="mp3"
        )

        temp_dir = tempfile.gettempdir()
        audio_path = os.path.join(temp_dir, f"response_{uuid.uuid4()}.mp3")

        with open(audio_path, "wb") as f:
            f.write(audio_response.content)
        st.write(f"Saved MP3 file to: {audio_path}")

        st.success("✅ Query processing complete!")
        return {
            "status": "success",
            "text_response": text_response,
            "voice_instructions": voice_instructions,
            "audio_path": audio_path,
            "sources": [r.payload.get('file_name', 'Unknown Source') for r in search_results if r.payload]
        }

    except Exception as e:
        st.error(f"❌ Error during query processing: {str(e)}")
        return {
            "status": "error",
            "error": str(e),
            "query": query
        }


def main() -> None:
    """Main application function."""
    st.set_page_config(
        page_title="Voice RAG Agent",
        page_icon="🎙️",
        layout="wide"
    )

    init_session_state()
    setup_sidebar()

    st.title("🎙️ Voice RAG Agent")
    st.info("Get voice-powered answers to your documentation questions by configuring your API keys and uploading PDF documents. Then, simply ask questions to receive both text and voice responses!")

    # File upload section
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

    if uploaded_file:
        file_name = uploaded_file.name
        if file_name not in st.session_state.processed_documents:
            with st.spinner('Processing PDF...'):
                try:
                    # Setup Qdrant if not already done
                    if not st.session_state.client:
                        client, embedding_model = setup_qdrant()
                        st.session_state.client = client
                        st.session_state.embedding_model = embedding_model

                    # Process and store document
                    documents = process_pdf(uploaded_file)
                    if documents:
                        store_embeddings(
                            st.session_state.client,
                            st.session_state.embedding_model,
                            documents,
                            COLLECTION_NAME
                        )
                        st.session_state.processed_documents.append(file_name)
                        st.success(f"✅ Added PDF: {file_name}")
                        st.session_state.setup_complete = True
                except Exception as e:
                    st.error(f"Error processing document: {str(e)}")

    # Display processed documents
    if st.session_state.processed_documents:
        st.sidebar.header("📚 Processed Documents")
        for doc in st.session_state.processed_documents:
            st.sidebar.text(f"📄 {doc}")

    # Query interface
    query = st.text_input(
        "What would you like to know about the documentation?",
        placeholder="e.g., How do I authenticate API requests?",
        disabled=not st.session_state.setup_complete
    )

    if query and st.session_state.setup_complete:
        with st.status("Processing your query...", expanded=True) as status:
            try:
                result = asyncio.run(process_query(
                    query,
                    st.session_state.client,
                    st.session_state.embedding_model,
                    COLLECTION_NAME,
                    st.session_state.openai_api_key,
                    st.session_state.selected_voice
                ))

                if result["status"] == "success":
                    status.update(label="✅ Query processed!", state="complete")

                    st.markdown("### Response:")
                    st.write(result["text_response"])

                    if "audio_path" in result:
                        st.markdown(f"### 🔊 Audio Response (Voice: {st.session_state.selected_voice})")
                        st.audio(result["audio_path"], format="audio/mp3", start_time=0)

                        with open(result["audio_path"], "rb") as audio_file:
                            audio_bytes = audio_file.read()
                        st.download_button(
                            label="📥 Download Audio Response",
                            data=audio_bytes,
                            file_name=f"voice_response_{st.session_state.selected_voice}.mp3",
                            mime="audio/mp3"
                        )

                    st.markdown("### Sources:")
                    for source in result["sources"]:
                        st.markdown(f"- {source}")
                else:
                    status.update(label="❌ Error processing query", state="error")
                    st.error(f"Error: {result.get('error', 'Unknown error occurred')}")

            except Exception as e:
                status.update(label="❌ Error processing query", state="error")
                st.error(f"Error processing query: {str(e)}")

    elif not st.session_state.setup_complete:
        st.info("👈 Please configure the system and upload documents first!")


if __name__ == "__main__":
    main()
```

**rag_tutorials/voice_rag_openaisdk/requirements.txt** (new file)

```
openai-agents
streamlit
qdrant-client
fastembed
langchain
langchain-community
langchain-openai
openai[voice_helpers]  # voice_helpers extra installs numpy/sounddevice for LocalAudioPlayer
pypdf  # required by PyPDFLoader
python-dotenv  # required by load_dotenv
```