Skip to content

Commit 7e9e9e7

Browse files
committed
adding gguf vision clip mtmd model class
1 parent 9361b11 commit 7e9e9e7

File tree

4 files changed

+1804
-3
lines changed

4 files changed

+1804
-3
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
""" GGUF Vision Model example - get started with vision-to-text using mtmd tool in conjunction with llama cpp """

from llmware.models import ModelCatalog

# catalog name of the gguf vision model to load
model_name = "qwen2.5-vl-3b-instruct-gguf"

# add path to local image
image_file_path = "/local/path/to/jpg_or_png_image"

# add text prompt/instruction
prompt = "Describe this image."

model = ModelCatalog().load_model(model_name, max_output=500)

# to run streaming generation
# -- flush on every token, since the tokens carry no newline and a line-buffered
# -- terminal would otherwise hold them back instead of showing them as generated
for token in model.stream(prompt, image_file_path):
    print(token, end="", flush=True)

# close out the streamed line so that the next print starts on a fresh line
print()

# to run inference (response once completed at the end)
response = model.inference(prompt, image_file_path)
print("--test: inference response: ", response)

llmware/gguf_configs.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,25 @@ def whisper_log_callback(level, text, user_data):
806806
do_nothing = 0
807807

808808

809+
# C callback prototype: void (*)(int level, const char * text, void * user_data)
# -- note: the decorated function below deliberately re-uses this name, so after the
# -- definition 'mtmd_log_callback' refers to the wrapped callback instance
mtmd_log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)


@mtmd_log_callback
def mtmd_log_callback(level, text, user_data):

    """ Controls the display log output from mtmd engine - currently exposing two options 'ON' or 'OFF'

    -- level: log level passed by the engine as an int (currently unused)
    -- text: log message as bytes, or None if the engine passes a NULL pointer
    -- user_data: opaque pointer passed back by the engine (currently unused)

    Output is written to stderr unless the environment variable 'llama_cpp_verbose' is set to "OFF". """

    # note: reserving level and user_data as options for the future
    # --adapted from more sophisticated logging mechanism in llama-cpp-python
    # --integrated with llama_cpp_verbose logging option for integrated debugging

    # no action taken if verbose is OFF
    if os.environ.get("llama_cpp_verbose") == "OFF":
        return

    # a NULL message pointer is delivered as None - nothing to display
    if text is None:
        return

    # errors="replace" - a log line that is not valid utf-8 (e.g., cut in the middle of a
    # multi-byte character) is still displayed, rather than raising inside the callback
    print(text.decode("utf-8", errors="replace"), end="", flush=True, file=sys.stderr)
826+
827+
809828
class GGUFConfigs:
810829

811830
"""GGUFConfigs is main global configuration object for GGUF Generative Models. Most of these config items
@@ -1089,3 +1108,176 @@ class whisper_full_params(ctypes.Structure):
10891108

10901109
]
10911110

1111+
1112+
""" MTMD & CLIP GGUF Interface Configurations """

# python-side handle types for the mtmd objects - each is a distinct NewType over int
mtmd_context_p = NewType("mtmd_context_p", int)
mtmd_bitmap_p = NewType("mtmd_bitmap_p", int)
mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int)
mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int)
mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)

# matching ctypes types for the argtypes/restype declarations - every handle is a void pointer
mtmd_context_p_ctypes = ctypes.c_void_p
mtmd_bitmap_p_ctypes = ctypes.c_void_p
mtmd_image_tokens_p_ctypes = ctypes.c_void_p
mtmd_input_chunk_p_ctypes = ctypes.c_void_p
mtmd_input_chunks_p_ctypes = ctypes.c_void_p

# integer codes for the mtmd input chunk types
MTMD_INPUT_CHUNK_TYPE_TEXT = 0
MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
1132+
1133+
1134+
class mtmd_context_params(ctypes.Structure):

    """ ctypes mapping of the mtmd context parameters - linked to mtmd with releases b7062+,
    e.g., starting ~Nov 2025 """

    # this api has been evolving - if errors, compare the fields below against the
    # interface in llama.cpp/tools/mtmd/mtmd.h
    # -- for a mtmd lib from before Nov 2025, see mtmd_context_params_alt_pre7062
    # -- as drop-in replacement

    _fields_ = [
        ("use_gpu", ctypes.c_bool),
        ("print_timings", ctypes.c_bool),
        ("n_threads", ctypes.c_int),
        ("image_marker", ctypes.c_char_p),
        ("media_marker", ctypes.c_char_p),

        # the former "verbosity" field (ctypes.c_int - ggml_log_level) was removed in b7062

        # new starting b6935
        ("llama_flash_attn_type", ctypes.c_int),
        ("warmup", ctypes.c_bool),
        ("image_min_tokens", ctypes.c_int),
        ("image_max_tokens", ctypes.c_int),

        # NOTE(review): field order must mirror the C struct exactly - confirm the relative
        # order of the two callback fields against mtmd.h for the lib version in use
        ("cb_eval_user_data", ctypes.c_void_p),
        ("cb_eval", ggml_backend_sched_eval_callback),
    ]
1160+
1161+
1162+
class mtmd_context_params_alt_pre7062(ctypes.Structure):

    """ Deprecated ctypes mapping of the mtmd context parameters - corresponds to mtmd releases
    in the second half of 2025, up to the b7062 release in November 2025 """

    # compared with mtmd_context_params, this layout still carries the "verbosity" field
    # and has none of the fields that follow "media_marker"

    _fields_ = [
        ("use_gpu", ctypes.c_bool),
        ("print_timings", ctypes.c_bool),
        ("n_threads", ctypes.c_int),
        ("verbosity", ctypes.c_int),        # ggml_log_level
        ("image_marker", ctypes.c_char_p),
        ("media_marker", ctypes.c_char_p),
    ]
1175+
1176+
1177+
class mtmd_input_text(ctypes.Structure):

    """ ctypes mapping of the text input structure that is passed by pointer to mtmd_tokenize """

    _fields_ = [
        ("text", ctypes.c_char_p),
        ("add_special", ctypes.c_bool),
        ("parse_special", ctypes.c_bool),
    ]
1183+
1184+
1185+
def add_libmtmd_ctypes_declarations(_libmtmd):

    """ Main mtmd library interfaces - sets the ctypes argtypes and restype on each mtmd function

    -- _libmtmd: loaded mtmd library object that exposes the mtmd functions as attributes

    Returns the same _libmtmd object once the declarations have been applied.

    Every function in the table below is required - a missing one surfaces as the AttributeError
    raised by the attribute lookup. Only mtmd_helper_log_set is treated as optional. """

    uint8_p = ctypes.POINTER(ctypes.c_uint8)

    # (function name, argtypes, restype) - applied in this order
    required_signatures = [

        ("mtmd_default_marker", [], ctypes.c_char_p),

        ("mtmd_context_params_default", [], mtmd_context_params),

        ("mtmd_init_from_file",
         [ctypes.c_char_p, llama_model_p_ctypes, mtmd_context_params],
         mtmd_context_p_ctypes),

        ("mtmd_free", [mtmd_context_p_ctypes], None),

        ("mtmd_support_vision", [mtmd_context_p_ctypes], ctypes.c_bool),

        ("mtmd_bitmap_init",
         [ctypes.c_uint32, ctypes.c_uint32, uint8_p],
         mtmd_bitmap_p_ctypes),

        ("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None),

        ("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes),

        ("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None),

        ("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], ctypes.c_size_t),

        ("mtmd_input_chunks_get",
         [mtmd_input_chunks_p_ctypes, ctypes.c_size_t],
         mtmd_input_chunk_p_ctypes),

        ("mtmd_tokenize",
         [mtmd_context_p_ctypes, mtmd_input_chunks_p_ctypes,
          ctypes.POINTER(mtmd_input_text), ctypes.POINTER(mtmd_bitmap_p_ctypes),
          ctypes.c_size_t],
         ctypes.c_int),

        ("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], ctypes.c_size_t),

        ("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], ctypes.c_int),

        ("mtmd_input_chunk_get_tokens_text",
         [mtmd_input_chunk_p_ctypes, ctypes.POINTER(ctypes.c_size_t)],
         ctypes.POINTER(llama_token)),

        # mtmd_helper_bitmap_init_from_buf
        ("mtmd_helper_bitmap_init_from_buf",
         [mtmd_context_p_ctypes, uint8_p, ctypes.c_size_t],
         mtmd_bitmap_p_ctypes),

        # mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname)
        ("mtmd_helper_bitmap_init_from_file",
         [mtmd_context_p_ctypes, ctypes.c_char_p],
         mtmd_bitmap_p_ctypes),

        ("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], ctypes.c_size_t),

        ("mtmd_helper_eval_chunk_single",
         [mtmd_context_p_ctypes,
          llama_context_p_ctypes,
          mtmd_input_chunk_p_ctypes,
          llama_pos, llama_seq_id,
          ctypes.c_int, ctypes.c_bool, ctypes.POINTER(llama_pos)],
         ctypes.c_int),
    ]

    for fn_name, fn_argtypes, fn_restype in required_signatures:
        fn = getattr(_libmtmd, fn_name)
        fn.argtypes = fn_argtypes
        fn.restype = fn_restype

    # expose mtmd_helper_log_set - but catch if not found
    # -- only the failed lookup is tolerated, any other error is left to propagate

    try:
        log_set = _libmtmd.mtmd_helper_log_set
    except AttributeError:
        # function not available in this lib - skip the declaration
        pass
    else:
        log_set.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        log_set.restype = None

    return _libmtmd
1283+

llmware/model_configs.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3741,7 +3741,33 @@
37413741
"validation_files": [],
37423742
"parameters": 4.0,
37433743
"link": "https://huggingface.co/llmware/gemma-3-4b-ov"
3744-
}
3744+
},
3745+
3746+
{"model_name": "qwen2.5-vl-3b-instruct-gguf", "display_name": "qwen2.5-vl-vision-model",
3747+
"model_family": "GGUFVisionGenerativeModel", "model_category": "generative_local",
3748+
"model_location": "llmware_repo", "context_window": 4096, "instruction_following": False,
3749+
"prompt_wrapper": "hf_chat", "temperature": 0.0, "trailing_space": "",
3750+
"gguf_repo": "llmware/qwen2.5-vl-3b-instruct-gguf",
3751+
"gguf_file": "Qwen2.5-VL-3B-Instruct-Q4_K_M.gguf",
3752+
"clip_file": "mmproj-F16.gguf",
3753+
"link": "https://huggingface.co/llmware/qwen2.5-vl-3b-instruct-gguf",
3754+
"tokenizer_local": "tokenizer_qw.json",
3755+
"fetch": {"module": "llmware.models", "method": "pull_snapshot_from_hf"},
3756+
"validation_files": [], "parameters": 3.0,
3757+
"custom_model_files": [], "custom_model_repo": ""},
3758+
3759+
{"model_name": "minicpm-2.6-gguf", "display_name": "minicpm-vision-model",
3760+
"model_family": "GGUFVisionGenerativeModel", "model_category": "generative_local",
3761+
"model_location": "llmware_repo", "context_window": 4096, "instruction_following": False,
3762+
"prompt_wrapper": "hf_chat", "temperature": 0.0, "trailing_space": "",
3763+
"gguf_repo": "llmware/minicpm-2.6-gguf",
3764+
"gguf_file": "MiniCPM-V-2_6-Q4_K_M.gguf",
3765+
"clip_file": "mmproj-model-f16-2.gguf",
3766+
"link": "https://huggingface.co/llmware/minicpm-2.6-gguf",
3767+
"tokenizer_local": "tokenizer_qw.json",
3768+
"fetch": {"module": "llmware.models", "method": "pull_snapshot_from_hf"},
3769+
"validation_files": [], "parameters": 7.0,
3770+
"custom_model_files": [], "custom_model_repo": ""}
37453771

37463772
]
37473773

0 commit comments

Comments
 (0)