From 78008788dee9fa4d267c3cebca0edffb294f2bb1 Mon Sep 17 00:00:00 2001
From: ShubhamSaboo <shubhamsaboo111@gmail.com>
Date: Thu, 12 Dec 2024 20:05:58 -0600
Subject: [PATCH] Added new demo

---
 .../multimodal_ai_agent/README.md             | 39 +++++++++
 .../multimodal_ai_agent/mutimodal_agent.py    | 82 +++++++++++++++++++
 .../multimodal_ai_agent/requirements.txt      |  2 +
 3 files changed, 123 insertions(+)
 create mode 100644 ai_agent_tutorials/multimodal_ai_agent/README.md
 create mode 100644 ai_agent_tutorials/multimodal_ai_agent/mutimodal_agent.py
 create mode 100644 ai_agent_tutorials/multimodal_ai_agent/requirements.txt

diff --git a/ai_agent_tutorials/multimodal_ai_agent/README.md b/ai_agent_tutorials/multimodal_ai_agent/README.md
new file mode 100644
index 0000000..bec4ebd
--- /dev/null
+++ b/ai_agent_tutorials/multimodal_ai_agent/README.md
@@ -0,0 +1,39 @@
+## 🧬 Multimodal AI Agent
+
+A Streamlit application that combines video analysis and web search capabilities using Google's Gemini 2.0 model. This agent can analyze uploaded videos and answer questions by combining visual understanding with web search.
+
+### Features
+
+- Video analysis using Gemini 2.0 Flash
+- Web research integration via DuckDuckGo
+- Support for multiple video formats (MP4, MOV, AVI)
+- Real-time video processing
+- Combined visual and textual analysis
+
+### How to get Started?
+
+1. Clone the GitHub repository
+
+```bash
+git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
+cd awesome-llm-apps/ai_agent_tutorials/multimodal_ai_agent
+```
+2. Install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+3. Get your Google Gemini API Key
+
+- Sign up for a [Google AI Studio account](https://aistudio.google.com/apikey) and obtain your API key.
+
+4. Set your Gemini API key as an environment variable
+
+```bash
+export GOOGLE_API_KEY=your_api_key_here
+```
+
+5. Run the Streamlit App
+```bash
+streamlit run mutimodal_agent.py
+```
\ No newline at end of file
diff --git a/ai_agent_tutorials/multimodal_ai_agent/mutimodal_agent.py b/ai_agent_tutorials/multimodal_ai_agent/mutimodal_agent.py
new file mode 100644
index 0000000..a2d0190
--- /dev/null
+++ b/ai_agent_tutorials/multimodal_ai_agent/mutimodal_agent.py
@@ -0,0 +1,112 @@
+"""Streamlit demo: ask questions about an uploaded video via Gemini and web search."""
+import tempfile
+import time
+from pathlib import Path
+
+import streamlit as st
+from google.generativeai import upload_file, get_file
+from phi.agent import Agent
+from phi.model.google import Gemini
+from phi.tools.duckduckgo import DuckDuckGo
+
+st.set_page_config(
+    page_title="Multimodal AI Agent",
+    page_icon="🧬",
+    layout="wide"
+)
+
+st.title("Multimodal AI Agent 🧬")
+
+
+@st.cache_resource
+def initialize_agent():
+    """Return the Gemini agent equipped with the DuckDuckGo search tool.
+
+    ``st.cache_resource`` lets Streamlit reuse one instance across reruns
+    instead of rebuilding it on every widget interaction.
+    """
+    return Agent(
+        name="Multimodal Analyst",
+        model=Gemini(id="gemini-2.0-flash-exp"),
+        tools=[DuckDuckGo()],
+        markdown=True,
+    )
+
+
+def _wait_for_processing(video_file, poll_seconds=2):
+    """Poll an uploaded Gemini file until it leaves the PROCESSING state.
+
+    Returns the refreshed file handle. Raises RuntimeError when the service
+    reports FAILED, so a broken upload is never handed to the agent.
+    """
+    while video_file.state.name == "PROCESSING":
+        time.sleep(poll_seconds)
+        video_file = get_file(video_file.name)
+    if video_file.state.name == "FAILED":
+        raise RuntimeError("Gemini could not process the uploaded video.")
+    return video_file
+
+
+agent = initialize_agent()
+
+# File uploader
+uploaded_file = st.file_uploader("Upload a video file", type=['mp4', 'mov', 'avi'])
+
+if uploaded_file:
+    # Preview straight from the in-memory upload. Streamlit reruns this script
+    # on every widget interaction, so writing a temp file here would leave a
+    # new copy on disk each time the user types or clicks.
+    st.video(uploaded_file)
+
+    user_prompt = st.text_area(
+        "What would you like to know?",
+        placeholder="Ask any question related to the video - the AI Agent will analyze it and search the web if needed",
+        help="You can ask questions about the video content and get relevant information from the web"
+    )
+
+    if st.button("Analyze & Research"):
+        if not user_prompt:
+            st.warning("Please enter your question.")
+        else:
+            video_path = None
+            try:
+                with st.spinner("Processing video and researching..."):
+                    # Keep the real extension so a .mov/.avi upload is not
+                    # mislabelled as .mp4; the file exists only for this run.
+                    suffix = Path(uploaded_file.name).suffix or '.mp4'
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                        video_path = tmp_file.name
+                        tmp_file.write(uploaded_file.getvalue())
+
+                    video_file = _wait_for_processing(upload_file(video_path))
+
+                    prompt = f"""
+                    First analyze this video and then answer the following question using both 
+                    the video analysis and web research: {user_prompt}
+                    
+                    Provide a comprehensive response focusing on practical, actionable information.
+                    """
+
+                    result = agent.run(prompt, videos=[video_file])
+
+                st.subheader("Result")
+                st.markdown(result.content)
+
+            except Exception as e:
+                # UI boundary: surface any failure in the page instead of a traceback.
+                st.error(f"An error occurred: {str(e)}")
+            finally:
+                # Always remove the temp copy, whether or not analysis succeeded.
+                if video_path is not None:
+                    Path(video_path).unlink(missing_ok=True)
+else:
+    st.info("Please upload a video to begin analysis.")
+
+# Pin the question box to a fixed height.
+st.markdown("""
+    <style>
+    .stTextArea textarea {
+        height: 100px;
+    }
+    </style>
+    """, unsafe_allow_html=True)
\ No newline at end of file
diff --git a/ai_agent_tutorials/multimodal_ai_agent/requirements.txt b/ai_agent_tutorials/multimodal_ai_agent/requirements.txt
new file mode 100644
index 0000000..ef3798e
--- /dev/null
+++ b/ai_agent_tutorials/multimodal_ai_agent/requirements.txt
@@ -0,0 +1,4 @@
+phidata==2.7.2
+google-generativeai==0.8.3
+streamlit
+duckduckgo-search
\ No newline at end of file