文章目录
本地LLM的图像和视频理解
运行效果(终端输出示例;程序在非分析帧会用 \r 在行首回显上一次结果,因此每条新输出之前会出现上一条结果的重复文字):
开始实时分析(中文输出),按 ESC 退出
[0] 书籍的堆叠放在一排架子上,最上面的是最高的那一层,最低是最低的那一层,背景是一面白色的墙。 (23451 ms)
书籍的堆叠放在一排架子上,最上面的是最高的那一层,最低是最低的那一层,背景是一面白色的墙。[30] 地板上放着一桶水和一个白色包,前面是一张桌,桌上有一个堆满了书籍的书架前坐着一个人。 (22819 ms)
地板上放着一桶水和一个白色包,前面是一张桌,桌上有一个堆满了书籍的书架前坐着一个人。[60] 花盆和书架旁,放着一盏台灯与键盘的办公桌。 (21536 ms)
花盆和书架旁,放着一盏台灯与键盘的办公桌。^C
完整代码 (vlm_analysis_translated.cpp)
以下为 C++ 源码:
#include <iostream>
#include <string>
#include <vector>
#include <chrono>
#include <thread>
#include <opencv2/opencv.hpp>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
// libcurl write callback: appends one received chunk to the caller's buffer.
//
// contents - pointer to the chunk delivered by libcurl
// size     - size of one element in the chunk
// nmemb    - number of elements in the chunk
// output   - destination string (the pointer registered via CURLOPT_WRITEDATA)
//
// Returns the number of bytes appended (size * nmemb).
size_t WriteCallback(void* contents, size_t size, size_t nmemb, std::string* output) {
    const size_t byteCount = size * nmemb;
    const char* const chunkBegin = static_cast<const char*>(contents);
    output->append(chunkBegin, chunkBegin + byteCount);
    return byteCount;
}
// Encodes `len` raw bytes starting at `data` as standard Base64
// (alphabet A-Z a-z 0-9 + /, padded with '=' to a multiple of four characters).
static std::string base64Encode(const unsigned char* data, std::size_t len) {
    static constexpr char kAlphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789+/";

    std::string encoded;
    // The output size is known up front; reserving avoids repeated reallocation
    // while appending one character at a time.
    encoded.reserve(((len + 2) / 3) * 4);

    // Full 3-byte groups -> 4 output characters each.
    std::size_t pos = 0;
    for (; pos + 3 <= len; pos += 3) {
        const unsigned int group = (static_cast<unsigned int>(data[pos]) << 16) |
                                   (static_cast<unsigned int>(data[pos + 1]) << 8) |
                                   static_cast<unsigned int>(data[pos + 2]);
        encoded += kAlphabet[(group >> 18) & 0x3f];
        encoded += kAlphabet[(group >> 12) & 0x3f];
        encoded += kAlphabet[(group >> 6) & 0x3f];
        encoded += kAlphabet[group & 0x3f];
    }

    // Trailing 1 or 2 bytes: missing bytes count as zero and are marked with '='.
    const std::size_t remaining = len - pos;
    if (remaining == 1) {
        const unsigned int group = static_cast<unsigned int>(data[pos]) << 16;
        encoded += kAlphabet[(group >> 18) & 0x3f];
        encoded += kAlphabet[(group >> 12) & 0x3f];
        encoded += "==";
    } else if (remaining == 2) {
        const unsigned int group = (static_cast<unsigned int>(data[pos]) << 16) |
                                   (static_cast<unsigned int>(data[pos + 1]) << 8);
        encoded += kAlphabet[(group >> 18) & 0x3f];
        encoded += kAlphabet[(group >> 12) & 0x3f];
        encoded += kAlphabet[(group >> 6) & 0x3f];
        encoded += '=';
    }
    return encoded;
}

// Compresses an OpenCV image to JPEG and returns the JPEG bytes as Base64 text.
//
// frame   - image to encode (the caller in this file passes a BGR camera frame)
// quality - value passed to OpenCV as cv::IMWRITE_JPEG_QUALITY (default 70)
//
// Returns the Base64 string, or an empty string when `frame` is empty or
// cv::imencode reports failure.
std::string matToBase64(const cv::Mat& frame, int quality = 70) {
    if (frame.empty()) {
        return {};
    }

    std::vector<uchar> jpegBytes;
    const std::vector<int> encodeParams = {cv::IMWRITE_JPEG_QUALITY, quality};
    if (!cv::imencode(".jpg", frame, jpegBytes, encodeParams)) {
        return {};
    }

    return base64Encode(jpegBytes.data(), jpegBytes.size());
}
// 调用 Ollama API 生成文本(不包含图像)
std::string chatOllama(const std::string& model, const std::string& prompt) {
CURL* curl = curl_easy_init();
if (!curl) return "curl init failed";
json req;
req["model"] = model;
req["prompt"] = prompt;
req["stream"] = false;
std::string reqStr = req.dump();
std::string response;
curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:11434/api/generate");
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, reqStr.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
struct curl_slist* headers = nullptr;
headers = curl_slist_append(headers, "Content-Type: application/json");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
CURLcode res = curl_easy_perform(curl);
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
if (res != CURLE_OK) {
return "curl error: " + std::string(curl_easy_strerror(res));
}
try {
json resp = json::parse(response);
if (resp.contains("response")) {
return resp["response"].get<std::string>();
} else {
return "no response field";
}
} catch (...) {
return "json parse error";
}
}
// 调用 Ollama API 进行视觉分析(图像+文本)
std::string askVisionOllama(const std::string& model, const std::string& prompt, const std::string& imageBase64) {
CURL* curl = curl_easy_init();
if (!curl) return "curl init failed";
json req;
req["model"] = model;
req["prompt"] = prompt;
req["stream"] = false;
req["images"] = {imageBase64};
std::string reqStr = req.dump();
std::string response;
curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:11434/api/generate");
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, reqStr.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
struct curl_slist* headers = nullptr;
headers = curl_slist_append(headers, "Content-Type: application/json");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
CURLcode res = curl_easy_perform(curl);
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
if (res != CURLE_OK) {
return "curl error: " + std::string(curl_easy_strerror(res));
}
try {
json resp = json::parse(response);
if (resp.contains("response")) {
return resp["response"].get<std::string>();
} else {
return "no response field";
}
} catch (...) {
return "json parse error";
}
}
// Returns true when `text` is a failure string returned by askVisionOllama() /
// chatOllama() rather than model output. Those functions report failures as
// "curl init failed", "curl error: <detail>", "no response field" or
// "json parse error". Matching these as prefixes, instead of searching for the
// word "error" anywhere, keeps a legitimate description that mentions "error"
// from being rejected and also catches the two strings that lack that word.
static bool isOllamaFailure(const std::string& text) {
    static const char* const kFailurePrefixes[] = {
        "curl init failed",
        "curl error: ",
        "no response field",
        "json parse error",
    };
    for (const char* prefix : kFailurePrefixes) {
        if (text.rfind(prefix, 0) == 0) {
            return true;
        }
    }
    return false;
}

// Captures frames from camera 0, shows them in a window, and every
// `processInterval` frames asks the vision model for an English description,
// has the translation model turn it into Chinese, and prints the result with
// the elapsed time. Exits on ESC or when the camera stops delivering frames.
// Returns 0 on normal exit, -1 if the camera cannot be opened.
int main() {
    // Model configuration.
    const std::string visionModel = "moondream";        // vision model (answers in English)
    const std::string translateModel = "qwen2.5:1.5b";  // translation model (English/Chinese)
    const std::string visionPrompt = "What is in this image? Describe it in one short sentence.";
    const int processInterval = 30;  // analyze one frame out of every 30

    // Open the camera.
    cv::VideoCapture cap(0);
    if (!cap.isOpened()) {
        std::cerr << "无法打开摄像头" << std::endl;
        return -1;
    }
    cap.set(cv::CAP_PROP_FRAME_WIDTH, 640);
    cap.set(cv::CAP_PROP_FRAME_HEIGHT, 480);

    cv::Mat frame;
    int frameCount = 0;
    std::string lastResult = "等待第一帧分析...";
    std::cout << "开始实时分析(中文输出),按 ESC 退出" << std::endl;

    while (true) {
        cap >> frame;
        if (frame.empty()) break;
        cv::imshow("VLM Analysis (Chinese Output)", frame);

        if (frameCount % processInterval == 0) {
            // NOTE: both model calls run synchronously, so the preview window
            // and ESC handling are blocked until they return.
            auto start = std::chrono::steady_clock::now();

            // 1. Encode the frame as Base64 JPEG.
            std::string imgBase64 = matToBase64(frame);

            // 2. Ask the vision model for an English description.
            std::string englishDesc = askVisionOllama(visionModel, visionPrompt, imgBase64);
            if (isOllamaFailure(englishDesc)) {
                lastResult = "视觉模型错误: " + englishDesc;
                std::cerr << lastResult << std::endl;
            } else {
                // 3. Ask the translation model to translate it into Chinese.
                std::string translatePrompt = "将以下英文翻译成中文,只输出中文翻译,不要有其他解释:\n" + englishDesc;
                std::string chineseDesc = chatOllama(translateModel, translatePrompt);
                if (isOllamaFailure(chineseDesc)) {
                    lastResult = "翻译错误: " + chineseDesc;
                    std::cerr << lastResult << std::endl;
                } else {
                    lastResult = chineseDesc;
                }
            }

            auto end = std::chrono::steady_clock::now();
            auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
            std::cout << "[" << frameCount << "] " << lastResult << " (" << ms << " ms)" << std::endl;
        } else {
            // Non-analysis frame: re-echo the previous result in place
            // ("\r" returns to the start of the line, no newline is written).
            std::cout << "\r" << lastResult << std::flush;
        }

        frameCount++;
        int key = cv::waitKey(1);
        if (key == 27) break;  // ESC
    }

    cap.release();
    cv::destroyAllWindows();
    return 0;
}
安装必要模型
在运行程序前,请先在终端中执行以下命令,确保已下载所需的两个模型:
# 下载视觉模型(英文描述)
ollama pull moondream
# 下载轻量翻译模型
ollama pull qwen2.5:1.5b
编译与运行
编译前需已安装 OpenCV 4、libcurl 和 nlohmann/json(仅头文件库)。在终端中执行以下命令:
# 编译
g++ -std=c++17 vlm_analysis_translated.cpp -o vlm_analysis_translated \
`pkg-config --cflags --libs opencv4` \
-lcurl -lpthread
# 运行
./vlm_analysis_translated