Updated repo structure

This commit is contained in:
ShubhamSaboo
2025-04-17 16:01:23 -05:00
parent c04de2fa3a
commit ea3fe6913b
220 changed files with 4 additions and 516 deletions

View File

@@ -0,0 +1,39 @@
## 🧬 Multimodal AI Agent
A Streamlit application that combines video analysis and web search capabilities using Google's Gemini 2.0 model. This agent can analyze uploaded videos and answer questions by combining visual understanding with web-search.
### Features
- Video analysis using Gemini 2.0 Flash
- Web research integration via DuckDuckGo
- Support for multiple video formats (MP4, MOV, AVI)
- Real-time video processing
- Combined visual and textual analysis
### How to get Started?
1. Clone the GitHub repository
```bash
git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
cd awesome-llm-apps/ai_agent_tutorials/multimodal_ai_agent
```
2. Install the required dependencies:
```bash
pip install -r requirements.txt
```
3. Get your Google Gemini API Key
- Sign up for a [Google AI Studio account](https://aistudio.google.com/apikey) and obtain your API key.
4. Set up your Gemini API Key as the environment variable
```bash
export GOOGLE_API_KEY=your_api_key_here
```
5. Run the Streamlit App
```bash
streamlit run multimodal_agent.py
```

View File

@@ -0,0 +1,62 @@
import streamlit as st
from agno.agent import Agent
from agno.models.google import Gemini
import tempfile
import os
def main():
    """Render the image-reasoning page and run the agent on demand.

    Shows an image uploader and a task box. When the user presses
    "Analyze Image" with a non-empty task, the upload is written to a
    temporary file, passed to the agent together with the task, and the
    agent's answer is rendered as markdown. Failures are reported in the
    page through ``st.error`` instead of being raised.
    """
    # Set up the reasoning agent
    agent = Agent(
        model=Gemini(id="gemini-2.0-flash-thinking-exp-1219"),
        markdown=True
    )

    # Streamlit app title
    st.title("Multimodal Reasoning AI Agent 🧠")

    # Instruction
    st.write(
        "Upload an image and provide a reasoning-based task for the AI Agent. "
        "The AI Agent will analyze the image and respond based on your input."
    )

    # File uploader for image
    uploaded_file = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    if uploaded_file is None:
        return

    try:
        # Display the uploaded image straight from the in-memory upload
        st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

        # Input for dynamic task
        task_input = st.text_area(
            "Enter your task/question for the AI Agent:"
        )

        # Button to process the image and task
        if st.button("Analyze Image"):
            if not task_input:
                # Tell the user why nothing happened instead of ignoring the click.
                st.warning("Please enter a task or question for the AI Agent.")
            else:
                with st.spinner("AI is thinking... 🤖"):
                    temp_path = None
                    try:
                        # The temp file is created only when an analysis is
                        # actually requested: Streamlit re-executes the script
                        # on every interaction, so writing it earlier would
                        # leave one orphaned file behind per rerun.
                        # Keep the upload's own extension (e.g. ".png").
                        suffix = os.path.splitext(uploaded_file.name)[1] or ".jpg"
                        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                            temp_path = tmp_file.name
                            tmp_file.write(uploaded_file.getvalue())

                        # Call the agent with the dynamic task and image path
                        # NOTE(review): the path is passed as a plain string,
                        # as before; confirm the installed agno version
                        # accepts that form for `images`.
                        response = agent.run(task_input, images=[temp_path])

                        # Display the response from the model
                        st.markdown("### AI Response:")
                        st.markdown(response.content)
                    except Exception as e:
                        st.error(f"An error occurred during analysis: {str(e)}")
                    finally:
                        # Clean up temp file
                        if temp_path is not None and os.path.exists(temp_path):
                            os.unlink(temp_path)
    except Exception as e:
        st.error(f"An error occurred while processing the image: {str(e)}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,83 @@
import streamlit as st
from agno.agent import Agent
from agno.models.google import Gemini
from agno.media import Video
import time
from pathlib import Path
import tempfile
# Page-level configuration followed by the visible heading.
st.set_page_config(page_title="Multimodal AI Agent", page_icon="🧬", layout="wide")
st.title("Multimodal AI Agent 🧬")

# The Gemini API key is collected through a password-type input field.
gemini_api_key = st.text_input(label="Enter your Gemini API Key", type="password")
# Build the agent used for video questions; st.cache_resource wraps the factory.
@st.cache_resource
def initialize_agent(api_key):
    """Return the "Multimodal Analyst" agent backed by gemini-2.0-flash.

    Args:
        api_key: Gemini API key forwarded to the model wrapper.

    Returns:
        An ``Agent`` with markdown output enabled. No tools are attached here.
    """
    gemini_model = Gemini(id="gemini-2.0-flash", api_key=api_key)
    return Agent(name="Multimodal Analyst", model=gemini_model, markdown=True)
# Main page flow: requires an API key, then an uploaded video, then a question.
if gemini_api_key:
    agent = initialize_agent(gemini_api_key)

    # File uploader
    uploaded_file = st.file_uploader("Upload a video file", type=['mp4', 'mov', 'avi'])

    if uploaded_file:
        # Preview directly from the in-memory upload. Streamlit re-executes the
        # script on every interaction, so writing a temp file here would leave
        # one orphaned file on disk per rerun.
        st.video(uploaded_file)

        user_prompt = st.text_area(
            "What would you like to know?",
            placeholder="Ask any question related to the video - the AI Agent will analyze it and search the web if needed",
            help="You can ask questions about the video content and get relevant information from the web"
        )

        if st.button("Analyze & Research"):
            if not user_prompt:
                st.warning("Please enter your question.")
            else:
                video_path = None
                try:
                    with st.spinner("Processing video and researching..."):
                        # Materialise the upload on disk only for the duration
                        # of the analysis, keeping its real extension
                        # (".mov"/".avi") instead of always using ".mp4".
                        suffix = Path(uploaded_file.name).suffix or '.mp4'
                        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                            video_path = tmp_file.name
                            # getvalue() returns the full buffer regardless of
                            # the current read position of the upload.
                            tmp_file.write(uploaded_file.getvalue())

                        video = Video(filepath=video_path)

                        # NOTE(review): the prompt asks for web research, but
                        # initialize_agent attaches no search tool to the
                        # agent; confirm whether one is intended.
                        prompt = (
                            "\n"
                            "First analyze this video and then answer the following question using both\n"
                            f"the video analysis and web research: {user_prompt}\n"
                            "Provide a comprehensive response focusing on practical, actionable information.\n"
                        )

                        result = agent.run(prompt, videos=[video])

                    st.subheader("Result")
                    st.markdown(result.content)
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                finally:
                    # Remove the temp file whether or not the analysis succeeded.
                    if video_path is not None:
                        Path(video_path).unlink(missing_ok=True)
    else:
        st.info("Please upload a video to begin analysis.")
else:
    st.warning("Please enter your Gemini API key to continue.")
# Inject a stylesheet that sets a 100px height on elements matching
# `.stTextArea textarea`; the markup is passed with unsafe_allow_html=True.
_TEXTAREA_STYLE = """
<style>
.stTextArea textarea {
height: 100px;
}
</style>
"""
st.markdown(_TEXTAREA_STYLE, unsafe_allow_html=True)

View File

@@ -0,0 +1,3 @@
agno
google-generativeai==0.8.3
streamlit==1.40.2