From 78008788dee9fa4d267c3cebca0edffb294f2bb1 Mon Sep 17 00:00:00 2001 From: ShubhamSaboo Date: Thu, 12 Dec 2024 20:05:58 -0600 Subject: [PATCH] Added new demo --- .../multimodal_ai_agent/README.md | 39 +++++++++ .../multimodal_ai_agent/mutimodal_agent.py | 82 +++++++++++++++++++ .../multimodal_ai_agent/requirements.txt | 2 + 3 files changed, 123 insertions(+) create mode 100644 ai_agent_tutorials/multimodal_ai_agent/README.md create mode 100644 ai_agent_tutorials/multimodal_ai_agent/mutimodal_agent.py create mode 100644 ai_agent_tutorials/multimodal_ai_agent/requirements.txt diff --git a/ai_agent_tutorials/multimodal_ai_agent/README.md b/ai_agent_tutorials/multimodal_ai_agent/README.md new file mode 100644 index 0000000..bec4ebd --- /dev/null +++ b/ai_agent_tutorials/multimodal_ai_agent/README.md @@ -0,0 +1,39 @@ +## 🧬 Multimodal AI Agent + +A Streamlit application that combines video analysis and web search capabilities using Google's Gemini 2.0 model. This agent can analyze uploaded videos and answer questions by combining visual understanding with web search. + +### Features + +- Video analysis using Gemini 2.0 Flash +- Web research integration via DuckDuckGo +- Support for multiple video formats (MP4, MOV, AVI) +- Real-time video processing +- Combined visual and textual analysis + +### How to Get Started + +1. Clone the GitHub repository + +```bash +git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git +cd awesome-llm-apps/ai_agent_tutorials/multimodal_ai_agent +``` +2. Install the required dependencies: + +```bash +pip install -r requirements.txt +``` +3. Get your Google Gemini API Key + +- Sign up for a [Google AI Studio account](https://aistudio.google.com/apikey) and obtain your API key. + +4. Set up your Gemini API Key as an environment variable + +```bash +export GOOGLE_API_KEY=your_api_key_here +``` + +5. 
"""Multimodal AI Agent: a Streamlit app that answers questions about a video.

The user uploads a video (MP4, MOV or AVI) and types a question. The video is
uploaded through ``google.generativeai.upload_file``, and a single phidata
``Agent`` (Gemini ``gemini-2.0-flash-exp`` with the DuckDuckGo tool) is asked
to answer using both the video and web research.

Run the Streamlit app (note the file is named ``mutimodal_agent.py``)::

    streamlit run mutimodal_agent.py

Dependencies (requirements.txt)::

    phidata==2.7.2
    google-generativeai==0.8.3

The accompanying README instructs users to provide the API key through the
``GOOGLE_API_KEY`` environment variable.
"""

import tempfile
import time
from pathlib import Path

import streamlit as st
from google.generativeai import get_file, upload_file
from phi.agent import Agent
from phi.model.google import Gemini
from phi.tools.duckduckgo import DuckDuckGo

# Seconds to sleep between polls of the uploaded file's processing state.
POLL_INTERVAL_SECONDS = 2

st.set_page_config(
    page_title="Multimodal AI Agent",
    page_icon="🧬",
    layout="wide"
)

st.title("Multimodal AI Agent 🧬")


# Initialize single agent with both capabilities
@st.cache_resource
def initialize_agent():
    """Build the agent used for both video analysis and web research.

    Decorated with ``st.cache_resource`` so the agent is created once and
    reused instead of being rebuilt on every script rerun.

    Returns:
        Agent: a phidata agent backed by Gemini with the DuckDuckGo tool.
    """
    return Agent(
        name="Multimodal Analyst",
        model=Gemini(id="gemini-2.0-flash-exp"),
        tools=[DuckDuckGo()],
        markdown=True,
    )


def upload_and_wait(video_path):
    """Upload a local video and block until it is no longer processing.

    Args:
        video_path: filesystem path of the video to upload.

    Returns:
        The file handle returned by the Gemini file API once its state has
        left ``PROCESSING``.

    Raises:
        RuntimeError: if the file ends up in the ``FAILED`` state, so a
            broken upload is reported instead of being sent to the agent.
    """
    video_file = upload_file(video_path)
    while video_file.state.name == "PROCESSING":
        time.sleep(POLL_INTERVAL_SECONDS)
        video_file = get_file(video_file.name)

    if video_file.state.name == "FAILED":
        raise RuntimeError("Video processing failed; please try another file.")
    return video_file


agent = initialize_agent()

# File uploader
uploaded_file = st.file_uploader("Upload a video file", type=['mp4', 'mov', 'avi'])

if uploaded_file:
    # Preview directly from the in-memory upload. Writing a temp file here
    # would create a new one on every Streamlit rerun without cleaning it up.
    st.video(uploaded_file)

    user_prompt = st.text_area(
        "What would you like to know?",
        placeholder="Ask any question related to the video - the AI Agent will analyze it and search the web if needed",
        help="You can ask questions about the video content and get relevant information from the web"
    )

    if st.button("Analyze & Research"):
        if not user_prompt:
            st.warning("Please enter your question.")
        else:
            # Keep the real extension so MOV/AVI uploads are not mislabelled
            # as MP4 when the MIME type is guessed from the file name.
            suffix = Path(uploaded_file.name).suffix.lower() or ".mp4"
            video_path = None
            try:
                with st.spinner("Processing video and researching..."):
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                        # getvalue() returns the full payload regardless of
                        # the current read position of the upload buffer.
                        tmp_file.write(uploaded_file.getvalue())
                        video_path = tmp_file.name

                    video_file = upload_and_wait(video_path)

                    prompt = f"""
                    First analyze this video and then answer the following question using both
                    the video analysis and web research: {user_prompt}

                    Provide a comprehensive response focusing on practical, actionable information.
                    """

                    result = agent.run(prompt, videos=[video_file])

                st.subheader("Result")
                st.markdown(result.content)

            except Exception as e:
                # Top-level UI boundary: surface any failure to the user.
                st.error(f"An error occurred: {e}")
            finally:
                # Always remove the local copy, whether or not analysis succeeded.
                if video_path is not None:
                    Path(video_path).unlink(missing_ok=True)
else:
    st.info("Please upload a video to begin analysis.")

# NOTE(review): this markdown payload is effectively empty; it looks like
# custom HTML/CSS was meant to be injected here -- confirm before removing.
st.markdown("""

    """, unsafe_allow_html=True)