Updated repo structure

2026-04-30 15:20:47 -05:00 · 2025-04-17 16:01:23 -05:00
parent c04de2fa3a
commit ea3fe6913b
220 changed files with 4 additions and 516 deletions
--- a/starter_ai_agents/multimodal_ai_agent/README.md
+++ b/starter_ai_agents/multimodal_ai_agent/README.md
@@ -0,0 +1,39 @@
+## 🧬 Multimodal AI Agent
+
+A Streamlit application that combines video analysis and web search capabilities using Google's Gemini 2.0 model. This agent can analyze uploaded videos and answer questions by combining visual understanding with web search.
+
+### Features
+
+- Video analysis using Gemini 2.0 Flash
+- Web research integration via DuckDuckGo
+- Support for multiple video formats (MP4, MOV, AVI)
+- Real-time video processing
+- Combined visual and textual analysis
+
+### How to get Started?
+
+1. Clone the GitHub repository
+
+```bash
+git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
+cd awesome-llm-apps/starter_ai_agents/multimodal_ai_agent
+```
+2. Install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+3. Get your Google Gemini API Key
+
+- Sign up for a [Google AI Studio account](https://aistudio.google.com/apikey) and obtain your API key.
+
+4. Set your Gemini API key as an environment variable
+
+```bash
+export GOOGLE_API_KEY=your_api_key_here
+```
+
+5. Run the Streamlit App
+```bash
+streamlit run mutimodal_agent.py
+```
--- a/starter_ai_agents/multimodal_ai_agent/multimodal_reasoning_agent.py
+++ b/starter_ai_agents/multimodal_ai_agent/multimodal_reasoning_agent.py
@@ -0,0 +1,62 @@
+import streamlit as st
+from agno.agent import Agent
+from agno.models.google import Gemini
+import tempfile
+import os
+
+def main():
+    """Render the image-reasoning page: upload an image, enter a task, show the reply."""
+    # Set up the reasoning agent
+    agent = Agent(
+        model=Gemini(id="gemini-2.0-flash-thinking-exp-1219"),
+        markdown=True,
+    )
+
+    st.title("Multimodal Reasoning AI Agent 🧠")
+    st.write(
+        "Upload an image and provide a reasoning-based task for the AI Agent. "
+        "The AI Agent will analyze the image and respond based on your input."
+    )
+
+    uploaded_file = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
+    if uploaded_file is None:
+        return
+
+    try:
+        # Display the uploaded image
+        st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
+
+        # Input for dynamic task
+        task_input = st.text_area("Enter your task/question for the AI Agent:")
+
+        # Streamlit reruns the script on every widget interaction, so the temp
+        # file is written only once an analysis is requested; writing it on each
+        # rerun would leave an undeleted file behind whenever no analysis follows.
+        if not (st.button("Analyze Image") and task_input):
+            return
+
+        # Keep the upload's own extension so a PNG is not stored as ".jpg".
+        suffix = os.path.splitext(uploaded_file.name)[1] or ".jpg"
+        temp_path = None
+        with st.spinner("AI is thinking... 🤖"):
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                    temp_path = tmp_file.name
+                    tmp_file.write(uploaded_file.getvalue())
+
+                # NOTE(review): the image is passed as a bare path string; confirm
+                # the installed agno version accepts that for `images`.
+                response = agent.run(task_input, images=[temp_path])
+
+                st.markdown("### AI Response:")
+                st.markdown(response.content)
+            except Exception as e:
+                st.error(f"An error occurred during analysis: {str(e)}")
+            finally:
+                # Clean up temp file, including when the write or the run failed
+                if temp_path is not None and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+    except Exception as e:
+        st.error(f"An error occurred while processing the image: {str(e)}")
+if __name__ == "__main__":  # build the page only when this file is the entry script
+    main()
--- a/starter_ai_agents/multimodal_ai_agent/mutimodal_agent.py
+++ b/starter_ai_agents/multimodal_ai_agent/mutimodal_agent.py
@@ -0,0 +1,83 @@
+import streamlit as st
+from agno.agent import Agent
+from agno.models.google import Gemini
+from agno.media import Video
+import time
+from pathlib import Path
+import tempfile
+
+# Configure the browser tab (title, icon) and use the wide page layout.
+st.set_page_config(page_title="Multimodal AI Agent", page_icon="🧬", layout="wide")
+
+st.title("Multimodal AI Agent 🧬")
+
+# The Gemini key is collected through a masked text input on the page itself.
+gemini_api_key = st.text_input(
+    "Enter your Gemini API Key",
+    type="password",
+)
+
+@st.cache_resource
+def initialize_agent(api_key):
+    """Build the "Multimodal Analyst" agent on Gemini 2.0 Flash for *api_key*.
+
+    Wrapped in st.cache_resource; no tools are passed to the Agent here.
+    """
+    gemini = Gemini(id="gemini-2.0-flash", api_key=api_key)
+    return Agent(name="Multimodal Analyst", model=gemini, markdown=True)
+
+if gemini_api_key:
+    agent = initialize_agent(gemini_api_key)
+
+    uploaded_file = st.file_uploader("Upload a video file", type=['mp4', 'mov', 'avi'])
+    if uploaded_file:
+        # Preview from the upload buffer; the temp file is written only when an
+        # analysis runs, because Streamlit re-executes this script on every
+        # interaction and a file written here would be left behind on each rerun.
+        st.video(uploaded_file)
+
+        user_prompt = st.text_area(
+            "What would you like to know?",
+            placeholder="Ask any question related to the video - the AI Agent will analyze it and search the web if needed",
+            help="You can ask questions about the video content and get relevant information from the web"
+        )
+
+        if st.button("Analyze & Research"):
+            if not user_prompt:
+                st.warning("Please enter your question.")
+            else:
+                video_path = None
+                try:
+                    with st.spinner("Processing video and researching..."):
+                        # Keep the real extension so MOV/AVI uploads are not saved as ".mp4".
+                        suffix = Path(uploaded_file.name).suffix or '.mp4'
+                        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                            video_path = tmp_file.name
+                            tmp_file.write(uploaded_file.getvalue())
+                        prompt = f"""
+                        First analyze this video and then answer the following question using both 
+                        the video analysis and web research: {user_prompt}
+                        
+                        Provide a comprehensive response focusing on practical, actionable information.
+                        """
+                        result = agent.run(prompt, videos=[Video(filepath=video_path)])
+
+                    st.subheader("Result")
+                    st.markdown(result.content)
+                except Exception as e:
+                    st.error(f"An error occurred: {str(e)}")
+                finally:
+                    if video_path is not None:
+                        Path(video_path).unlink(missing_ok=True)
+    else:
+        st.info("Please upload a video to begin analysis.")
+else:
+    st.warning("Please enter your Gemini API key to continue.")
+
+st.markdown("""
+    <style>
+    .stTextArea textarea {
+        height: 100px;
+    }
+    </style>
+    """, unsafe_allow_html=True)  # raw <style> block: sets `.stTextArea textarea` height to 100px
--- a/starter_ai_agents/multimodal_ai_agent/requirements.txt
+++ b/starter_ai_agents/multimodal_ai_agent/requirements.txt
@@ -0,0 +1,3 @@
+agno
+google-generativeai==0.8.3
+streamlit==1.40.2