[GH-ISSUE #2049] Embedding API could return empty embedding while using completion API from LiteLLM #26945

Open
opened 2026-04-22 03:44:24 -05:00 by GiteaMirror · 0 comments
Owner

Originally created by @James4Ever0 on GitHub (Jan 18, 2024).
Original GitHub issue: https://github.com/ollama/ollama/issues/2049

To reproduce:

Launch a LiteLLM service:

```bash
litellm --model ollama/openhermes2.5-mistral --drop_params
```
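
Before starting the loops below, it may be worth confirming the proxy is actually up; a quick check, assuming LiteLLM exposes the OpenAI-compatible `/v1/models` route:

```python
import requests

# List the models served by the LiteLLM proxy started above.
r = requests.get("http://0.0.0.0:8000/v1/models",
                 headers={"Authorization": "Bearer any"})
print(r.status_code, r.json())
```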

Call the service's `/completion` API continuously; meanwhile, call the embedding API via LangChain. With luck, an embedding request that lands in the (very short) gap between two `/completion` calls returns an empty embedding.
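
To help isolate where the empty result originates, the embeddings endpoint can also be exercised against Ollama directly, cutting LiteLLM and LangChain out of the loop. A minimal sketch, assuming Ollama is on its default port and serves the documented `/api/embeddings` route:

```python
import requests

OLLAMA = "http://localhost:11434"  # assumed default Ollama address

while True:
    r = requests.post(
        f"{OLLAMA}/api/embeddings",
        json={"model": "openhermes2.5-mistral:latest", "prompt": "Hello world"},
        timeout=60,
    )
    embedding = r.json().get("embedding")
    if not embedding:  # missing or empty list -- the reported bug
        print("empty embedding:", r.text)
```

If this loop also prints empty embeddings while the completion loop below is running, the bug is in Ollama itself rather than in the proxy layers.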

To call the `/completion` API:

```python
import os
os.environ['OPENAI_API_KEY'] = 'any'
os.environ['OPENAI_API_BASE'] = 'http://0.0.0.0:8000'

from contextlib import contextmanager
from langchain.llms import OpenAI
import tiktoken

def print_center(banner: str):
    print(banner.center(50, "="))


class LLM:
    """
    A class for running a Language Model Chain.
    """

    def __init__(self, prompt: str, temperature=0, gpt_4=False):
        """
        Initializes the LLM class.
        Args:
            prompt (str): The system prompt to prepend to each query.
            temperature (float): The sampling temperature for the model.
            gpt_4 (bool): Whether to use GPT-4 or Text-Davinci-003.
        Side Effects:
            Sets the class attributes.
        """
        self.prompt = prompt
        self.prompt_size = self.number_of_tokens(prompt)
        self.temperature = temperature
        self.gpt_4 = gpt_4
        self.model_name = "gpt-4" if self.gpt_4 else "text-davinci-003"
        self.max_tokens = 8192 if self.gpt_4 else 4097  # context windows: gpt-4 vs text-davinci-003
        self.show_init_config()

    def show_init_config(self):
        print_center("init params")
        print(f"Model: {self.model_name}")
        print(f"Max Tokens: {self.max_tokens}")
        print(f"Prompt Size: {self.prompt_size}")
        print(f"Temperature: {self.temperature}")
        print_center("init config")
        print(self.prompt)

    def run(self, query):
        """
        Runs the Language Model Chain.
        Args:
            query (str): The query appended to the prompt.
        Returns:
            str: The generated text.
        """
        llm = OpenAI(
            temperature=self.temperature,
            max_tokens=-1,
            model_name=self.model_name,
            disallowed_special=(),  # treat special tokens as plain text instead of raising an error
        )
        # chain = LLMChain(llm=llm, prompt=self.prompt)
        chunk_list = []
        print_center("query")
        print(query)
        print_center("response")
        _input = "\n".join([self.prompt, query])
        for chunk in llm.stream(input=_input):
            print(chunk, end="", flush=True)
            chunk_list.append(chunk)
        print()

        result = "".join(chunk_list)
        return result

    def number_of_tokens(self, text):
        """
        Counts the number of tokens in a given text.
        Args:
            text (str): The text to count tokens for.
        Returns:
            int: The number of tokens in the text.
        """
        encoding = tiktoken.encoding_for_model("gpt-4")
        return len(encoding.encode(text, disallowed_special=()))


@contextmanager
def llm_context(prompt: str, temperature=0, gpt_4=False):
    model = LLM(prompt, temperature=temperature, gpt_4=gpt_4)
    try:
        yield model
    finally:
        del model

if __name__ == "__main__":
    while True:
        with llm_context("You are a helpful assistant.") as model:
            model.run("Write me a 100 words news.")
```

To call the embedding API:

```python
from langchain.embeddings import OllamaEmbeddings

ollama_emb = OllamaEmbeddings(
    model="openhermes2.5-mistral:latest",
)

while True:
    embed = ollama_emb.embed_query("Hello world")
    print("empty embedding?", embed is None)
GiteaMirror added the embeddings, bug, api labels 2026-04-22 03:44:24 -05:00
Reference: github-starred/ollama#26945