llama.cpp https://download.csdn.net/download/klam2020/89122425
您是否对 OpenAI 模型的功能着迷,并想尝试创建一个虚假的 OpenAI 服务器用于测试或教育目的? 在本指南中,我们将引导您完成使用 llama.cpp 设置模拟 OpenAI 服务器的过程,并提供演示代码片段来帮助您入门。
首先,您需要从 GitHub 克隆 llama.cpp 存储库。 您可以这样做:
git clone https://github.com/ggerganov/llama.cpp
对于 Mac 用户:
导航到 llama.cpp 目录并运行以下命令:
cd llama.cpp && make
对于 Windows 用户:
下载 w64devkit 的最新 Fortran 版本。
在您的 PC 上解压 w64devkit 并运行 w64devkit.exe。
使用 cd 命令导航到 llama.cpp 文件夹。
运行以下命令:
make
安装所需的包
设置 llama.cpp 后,您需要安装必要的 Python 包。 运行以下命令:
pip install openai 'llama-cpp-python[server]' pydantic instructor streamlit colorama yfinance
启动服务器
现在您已经安装了所需的组件,您可以使用不同的模型和配置启动假 OpenAI 服务器。 这里有些例子:
单模型聊天 Single Model Chat:
python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf
具有 GPU 卸载功能的单模型聊天 Single Model Chat with GPU Offload:
python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf --n_gpu -1
使用 GPU 卸载的单模型函数调用 Single Model Function Calling with GPU Offload:
python -m llama_cpp.server --model models/mistral-7b-instruct-v0.1.Q4_0.gguf --n_gpu -1 --chat functionary
带配置的多模型加载 Multiple Model Load with Config:
python -m llama_cpp.server --config_file config.json
多模态模型 Multi Modal Models:
python -m llama_cpp.server --model models/llava-v1.5-7b-Q4_K.gguf --clip_model_path models/llava-v1.5-7b-mmproj-Q4_0.gguf --n_gpu -1 --chat llava-1-5
可用模型
以下是您可以尝试的一些模型:
Mistral: TheBloke/Mistral-7B-Instruct-v0.1-GGUF
Mixtral: TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF
LLaVa: jartine/llava-v1.5-7B-GGUF
通过遵循这些步骤并利用提供的演示代码,您可以使用 llama.cpp 创建模拟 OpenAI 服务器以用于实验和学习目的。 在受控环境中探索这些模型的功能,享受乐趣!
Fakeserver
1. openaitest.py
"""Stream short completions for a few finance prompts from the local server."""
import time

from colorama import Back, Fore, Style
from colorama import init
from openai import OpenAI

# Set up colorama so the colour codes below are handled by the terminal.
init()

# Point the OpenAI client at the local OpenAI-compatible endpoint.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="123",
)

# Fixed pause before the first request.
# NOTE(review): presumably gives the server time to come up — confirm.
time.sleep(5)

prompts = [
    "what is ROI in the context of finance, provide a worked example?",
    "define the efficient frontier in the context of finance",
    "what is glass stegal?",
    "how does derivative pricing work?",
]

for prompt in prompts:
    # Echo the question in magenta, then stream the answer in blue.
    print(Fore.LIGHTMAGENTA_EX + prompt, end="\n")

    stream = client.chat.completions.create(
        model="llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        max_tokens=20,
    )

    for chunk in stream:
        piece = chunk.choices[0].delta.content
        if piece is None:
            # Chunks without text content carry nothing to print.
            continue
        print(Fore.LIGHTBLUE_EX + piece, end="", flush=True)

    print("\n")
2. main.py
"""Streamlit chat UI that streams replies from the local OpenAI-compatible server."""
import streamlit as st
from openai import OpenAI

MODEL_NAME = "llama.cpp/models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf"
SYSTEM_PROMPT = """You are a helpful assistant. If you do not know the answer, reply I don't know don't make things up."""

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="123",
)

# Seed the conversation with the system prompt on the first run of the session.
if "messages" not in st.session_state:
    st.session_state["messages"] = [{"role": "system", "content": SYSTEM_PROMPT}]

st.title("🚀 LLaMa CPP Python")

# Replay the stored conversation; Streamlit re-executes the script on every interaction.
history = st.session_state.messages
for past in history:
    st.chat_message(past["role"]).markdown(past["content"])

prompt = st.chat_input("Pass your input here")
if prompt:
    st.chat_message("user").markdown(prompt)
    history.append({"role": "user", "content": prompt})

    # The whole history is sent so the model sees the conversation so far.
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=history,
        stream=True,
    )

    complete_response = ""
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        for chunk in response:
            delta_text = chunk.choices[0].delta.content
            if delta_text is None:
                continue
            complete_response += delta_text
            # Trailing block character acts as a typing cursor while streaming.
            message_placeholder.markdown(complete_response + "▌")
        # Final render without the cursor.
        message_placeholder.markdown(complete_response)

    history.append({"role": "assistant", "content": complete_response})
3. instructor.py
"""Streamlit demo: extract structured fields from a prompt via instructor."""
from openai import OpenAI
import streamlit as st
import instructor
from pydantic import BaseModel

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="123",
)

# Enables `response_model`
client = instructor.patch(client=client)


class UserDetail(BaseModel):
    """Fields the model is asked to extract from the user's prompt."""

    stock_ticker: str
    # NOTE(review): start_date is an int while end_date is a str — confirm
    # this asymmetry is intended.
    start_date: int
    end_date: str


if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {
            "role": "system",
            "content": """You are a helpful assistant. If you do not know the answer, reply I don't know don't make things up.""",
        }
    ]

st.title("🚀 LLaMa CPP Python")

# Replay the stored conversation on every rerun of the script.
for message in st.session_state.messages:
    st.chat_message(message["role"]).markdown(message["content"])

prompt = st.chat_input("Pass your input here")
if prompt:
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Only the current prompt is sent; the extraction does not use the history.
    # With `response_model` set the call is not streamed: the result is iterated
    # below as (field_name, value) pairs, which is how a pydantic model iterates.
    response = client.chat.completions.create(
        max_tokens=-1,
        model="mistral-function-calling",
        response_model=UserDetail,
        messages=[{"role": "user", "content": prompt}],
    )

    # Build the text once so the same content is both displayed now and stored
    # in the history (previously an empty string was stored, so the assistant
    # turn was replayed blank on the next rerun).
    complete_response = "\n".join(
        f"- **{field_name}**: {value}" for field_name, value in response
    )
    with st.chat_message("assistant"):
        st.markdown(complete_response)

    st.session_state.messages.append(
        {"role": "assistant", "content": complete_response}
    )
4. multimodal.py
"""Streamlit demo: extract structured fields from a prompt via instructor.

NOTE(review): this listing is titled multimodal.py, but the code sends only a
text prompt and no image content — confirm whether a different script was
intended here.
"""
from openai import OpenAI
import streamlit as st
import instructor
from pydantic import BaseModel

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="123",
)

# Enables `response_model`
client = instructor.patch(client=client)


class UserDetail(BaseModel):
    """Fields the model is asked to extract from the user's prompt."""

    stock_ticker: str
    # NOTE(review): start_date is an int while end_date is a str — confirm
    # this asymmetry is intended.
    start_date: int
    end_date: str


if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {
            "role": "system",
            "content": """You are a helpful assistant. If you do not know the answer, reply I don't know don't make things up.""",
        }
    ]

st.title("🚀 LLaMa CPP Python")

# Replay the stored conversation on every rerun of the script.
for message in st.session_state.messages:
    st.chat_message(message["role"]).markdown(message["content"])

prompt = st.chat_input("Pass your input here")
if prompt:
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Only the current prompt is sent; the extraction does not use the history.
    # With `response_model` set the call is not streamed: the result is iterated
    # below as (field_name, value) pairs, which is how a pydantic model iterates.
    response = client.chat.completions.create(
        max_tokens=-1,
        model="mistral-function-calling",
        response_model=UserDetail,
        messages=[{"role": "user", "content": prompt}],
    )

    # Build the text once so the same content is both displayed now and stored
    # in the history (previously an empty string was stored, so the assistant
    # turn was replayed blank on the next rerun).
    complete_response = "\n".join(
        f"- **{field_name}**: {value}" for field_name, value in response
    )
    with st.chat_message("assistant"):
        st.markdown(complete_response)

    st.session_state.messages.append(
        {"role": "assistant", "content": complete_response}
    )
5. app.py
"""Streamlit app: extract a ticker and day count, fetch prices, stream a summary."""
from openai import OpenAI

# Streamlit application framework
import streamlit as st

# instructor adds `response_model` support to the client
import instructor

# Base class for the extraction schema
from pydantic import BaseModel

# Stock price lookup helper
from stock_data import get_stock_prices

# Create a client pointed at the local OpenAI-compatible server
client = OpenAI(api_key="jhjhjh1234", base_url="http://localhost:8000/v1")

# Patch the client so `response_model` can be used
client = instructor.patch(client=client)


class ResponseModel(BaseModel):
    """What to extract from the user's prompt."""

    ticker: str
    days: int


# Application title
st.title("🚀 Fake OpenAI Server App (...llama cpp)")
prompt = st.chat_input("Pass your prompt here")

# Runs once the user has typed a prompt and pressed Enter
if prompt:
    st.chat_message("user").markdown(prompt)

    # First call: extract the ticker and number of days from the prompt.
    response = client.chat.completions.create(
        model="mistral-function-calling",
        messages=[{"role": "user", "content": prompt}],
        response_model=ResponseModel,
    )
    st.chat_message("ai").markdown(response)

    try:
        prices = get_stock_prices(response.ticker, response.days)

        # get_stock_prices reports failures as {"error": ...} instead of raising;
        # route that into the handler below rather than sending the error text
        # to the summarising model as if it were price data.
        if isinstance(prices, dict) and "error" in prices:
            raise RuntimeError(prices["error"])

        st.chat_message("ai").markdown(prices)

        # Second call: summarise using the original prompt plus the fetched prices.
        fullresponse = client.chat.completions.create(
            model="mixtral",
            messages=[{"role": "user", "content": prompt + "\n" + str(prices)}],
            stream=True,
        )

        with st.chat_message("ai"):
            completed_message = ""
            message = st.empty()
            # Stream the response, re-rendering the accumulated text per chunk.
            for chunk in fullresponse:
                if chunk.choices[0].delta.content is not None:
                    completed_message += chunk.choices[0].delta.content
                    message.markdown(completed_message)
    except Exception as e:
        # Keep the friendly message, but also surface the cause so the failure
        # can be diagnosed instead of being silently discarded.
        with st.chat_message("ai"):
            st.markdown("Something went wrong 😭")
            st.exception(e)
6. config.json
{ "host": "0.0.0.0", "port": 8000, "models": [ { "model": "models/mistral-7b-instruct-v0.1.Q4_0.gguf", "model_alias": "mistral", "chat_format": "chatml", "n_gpu_layers": -1, "offload_kqv": true, "n_threads": 12, "n_batch": 512, "n_ctx": 2048 }, { "model": "models/mixtral-8x7b-instruct-v0.1.Q2_K.gguf", "model_alias": "mixtral", "chat_format": "chatml", "n_gpu_layers": -1, "offload_kqv": true, "n_threads": 12, "n_batch": 512, "n_ctx": 2048 }, { "model": "models/mistral-7b-instruct-v0.1.Q4_0.gguf", "model_alias": "mistral-function-calling", "chat_format": "functionary", "n_gpu_layers": -1, "offload_kqv": true, "n_threads": 12, "n_batch": 512, "n_ctx": 2048 } ] }
7. stock_data.py
"""Fetch recent daily stock prices as JSON-compatible data."""
import yfinance as yf
import json
from datetime import datetime


def get_stock_prices(ticker, days):
    """Return daily price rows for ``ticker`` covering the last ``days`` days.

    Args:
        ticker: Ticker symbol passed to ``yf.download``.
        days: Number of days; used to build the ``period`` string ``f"{days}d"``.

    Returns:
        The parsed JSON of the downloaded table with ``orient="index"``, with
        the index formatted as ``dd/mm/YYYY`` strings. On any failure, or when
        no rows are returned, a dict of the form ``{"error": <message>}`` is
        returned instead; this function does not raise.
    """
    try:
        # Fetch stock data
        stock_data = yf.download(ticker, period=f"{days}d", interval="1d")

        # Report an empty result explicitly rather than relying on a later
        # step failing with an unrelated message.
        if stock_data.empty:
            return {"error": f"No price data returned for {ticker!r}"}

        # Format the DateTimeIndex to dd/mm/yyyy so the JSON keys are date strings
        stock_data.index = stock_data.index.strftime("%d/%m/%Y")

        # Serialise to JSON, then parse back into plain Python objects
        return json.loads(stock_data.to_json(orient="index"))
    except Exception as e:
        return {"error": str(e)}


# Example usage. Guarded so that importing this module (for example
# `from stock_data import get_stock_prices`) does not trigger a download.
if __name__ == "__main__":
    ticker = "AAPL"  # Example ticker
    days = 30  # Example number of days
    prices = get_stock_prices(ticker, days)
    print(prices)