# Janus-Pro Deployment and Usage
Janus Pro is an open-source multimodal large model from the Chinese AI company DeepSeek, officially released on January 28, 2025. Its capabilities include text-to-image generation, image understanding, and multimodal instruction following. This article focuses on its image-understanding capability.
## Introduction
- Official site: Janus Pro 7B - next-generation multimodal AI model | DeepSeek
- GitHub: https://github.com/deepseek-ai/Janus.git
- Hugging Face: deepseek-ai/Janus-Pro-7B
- ModelScope (mirror for mainland China): Janus-Pro-1B | Janus-Pro-7B
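For users in mainland China, the weights can also be fetched from the ModelScope mirror rather than Hugging Face. A minimal sketch, assuming a recent `modelscope` release (`pip install modelscope`) that exposes `snapshot_download` at the top level:

```python
# Sketch: download the Janus-Pro-7B weights from the ModelScope mirror
from modelscope import snapshot_download

# Returns the local cache directory; this path can later be passed as model_path
local_dir = snapshot_download("deepseek-ai/Janus-Pro-7B")
print(local_dir)
```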
## Quick Start
### Download the Source
```bash
git clone https://github.com/deepseek-ai/Janus.git
```
### Set Up the Environment
```bash
# Create and activate a virtual environment
python -m venv venv
.\venv\Scripts\activate
# Tsinghua mirror, if needed: -i https://pypi.tuna.tsinghua.edu.cn/simple
# Install the CUDA build of PyTorch, pinned to torch==2.0.1
pip install torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
# Install the project and its dependencies in editable mode
pip install -e .
```
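After installation, a quick sanity check (a sketch, not part of the original steps) confirms that the CUDA build of PyTorch is active:

```python
# Verify the pinned torch version and CUDA availability
import torch
print(torch.__version__)          # expected: 2.0.1+cu118
print(torch.cuda.is_available())  # expected: True on a machine with a CUDA GPU
```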
### Run
```python
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images

# On the first run the model is downloaded and cached under
# C:\Users\name\.cache\huggingface\hub\models--deepseek-ai--Janus-Pro-1B.
# A local directory can be passed here instead.
model_path = "deepseek-ai/Janus-Pro-1B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Define the question
question = "What animal is in the picture?"
image = "./dog.jpg"
conversation = [
    {
        "role": "<|User|>",
        "content": f"<image_placeholder>\n{question}",
        "images": [image],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Load the images and prepare the inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation, images=pil_images, force_batchify=True
).to(vl_gpt.device)

# Run the image encoder to get the image embeddings
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# Run the model to get the response
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)

answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)
```
Output:
```
You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.

<|User|>: <image_placeholder>
What animal is in the picture?

<|Assistant|>: It is a Corgi. The Corgi is a small herding dog known for its short legs, long body, and upright ears. Corgis usually have white and brown fur; the one in the picture has a white chest and head, with brown ears and body.
```
## Using the WebUI
Building on the development environment above, run the following commands:
```bash
# Install the gradio extras
pip install -e .[gradio]
# Launch the WebUI. Before launching, consider switching the model to
# "deepseek-ai/Janus-Pro-1B" to avoid running out of GPU memory.
python demo/app_januspro.py
```
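The demo hard-codes the model ID near the top of `demo/app_januspro.py`, so switching to the 1B checkpoint should be a one-line edit; a sketch (the exact variable name may differ between repo revisions):

```python
# demo/app_januspro.py (near the top of the file; assumed variable name)
model_path = "deepseek-ai/Janus-Pro-1B"  # was "deepseek-ai/Janus-Pro-7B"
```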
## Multi-Image Example
The script below asks the same question about a batch of images and writes annotated copies of the results to the `out` directory:
```python
import torch
from transformers import AutoModelForCausalLM, AutoConfig
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import os
from PIL import Image, ImageDraw, ImageFont
import textwrap
import glob
import re

# Disable online downloads; everything is loaded from the local cache
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Input and output paths
input_folder = "F:\\datasets\\random_image_sets"
output_folder = "out"
os.makedirs(output_folder, exist_ok=True)

# Load the model; the local snapshot path overrides the hub ID for offline use
model_path = "deepseek-ai/Janus-Pro-1B"
model_path = "C:\\Users\\wangxin\\.cache\\huggingface\\hub\\models--deepseek-ai--Janus-Pro-1B\\snapshots\\960ab33191f61342a4c60ae74d8dc356a39fafcb"
config = AutoConfig.from_pretrained(model_path)
language_config = config.language_config
language_config._attn_implementation = 'eager'  # force eager attention
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    language_config=language_config,
    trust_remote_code=True
)

# Device setup: bfloat16 on GPU, float16 on CPU
if torch.cuda.is_available():
    vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
else:
    vl_gpt = vl_gpt.to(torch.float16)

# Load the processor and tokenizer
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# The question asked about every image
question = "Is the person in the picture standing?"

# Collect all image files
image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.bmp']
image_files = []
for ext in image_extensions:
    image_files.extend(glob.glob(os.path.join(input_folder, ext)))

index = 0
# Start at image number start_index and stop after total_images
start_index = 10
total_images = 100

# Process each image
for img_path in image_files:
    index += 1
    if index < start_index:
        continue
    if index > total_images:
        break
    try:
        print(f"Processing: {img_path}")
        # Build the conversation
        conversation = [
            {
                "role": "<|User|>",
                "content": f"<image_placeholder>\n{question}",
                "images": [img_path],
            },
            {"role": "<|Assistant|>", "content": ""},
        ]
        # Load the image and prepare the inputs
        pil_images = load_pil_images(conversation)
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to(vl_gpt.device)
        # Run the image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
        # Generate the answer
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )
        # Decode and extract the assistant's answer
        full_response = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        print(f"Full response: {full_response}")
        # Extract the assistant's answer robustly
        assistant_matches = re.findall(r'<\|Assistant\|>(.*?)(<\||$)', full_response, re.DOTALL)
        if assistant_matches:
            answer = assistant_matches[0][0].strip()
        else:
            # No assistant tag found: strip all role tags and use what remains
            clean_response = re.sub(r'<\|[^>]+\|>', '', full_response).strip()
            answer = clean_response if clean_response else "No answer returned by the model"
        print(f"Extracted answer: {answer}")
        # Draw the answer onto a copy of the image
        try:
            original_img = Image.open(img_path).convert("RGB")
        except OSError:
            print(f"Error opening image: {img_path}")
            continue
        img_with_text = original_img.copy()
        draw = ImageDraw.Draw(img_with_text)
        # Scale the font size to the image dimensions
        img_width, img_height = original_img.size
        base_font_size = max(int(min(img_width, img_height) * 0.03), 12)
        try:
            # Try a CJK-capable font first
            font = ImageFont.truetype("simhei.ttf", base_font_size)
        except OSError:
            try:
                # Windows font path
                font = ImageFont.truetype("C:/Windows/Fonts/simhei.ttf", base_font_size)
            except OSError:
                try:
                    # macOS font path
                    font = ImageFont.truetype("/Library/Fonts/Arial Unicode.ttf", base_font_size)
                except OSError:
                    # Fall back to Pillow's built-in default font
                    font = ImageFont.load_default()
        # Wrap the text, showing at most 10 lines
        max_chars_per_line = 30
        wrapped_text = textwrap.wrap(answer, width=max_chars_per_line)
        if len(wrapped_text) > 10:
            wrapped_text = wrapped_text[:10]
            wrapped_text[-1] = wrapped_text[-1] + "..."  # mark the truncation
        # Size of the caption area
        line_height = int(base_font_size * 1.3)
        text_height = len(wrapped_text) * line_height
        background_height = text_height + 20
        # Semi-transparent caption bar along the bottom edge:
        # black by default, green when the answer is affirmative
        background = Image.new('RGBA', (img_width, background_height), (0, 0, 0, 180))
        if "Yes" in answer:
            background = Image.new('RGBA', (img_width, background_height), (0, 100, 0, 180))
        img_with_text.paste(background, (0, img_height - background_height), background)
        # Draw the text line by line
        text_position = (20, img_height - background_height + 10)
        for line in wrapped_text:
            draw.text(text_position, line, font=font, fill="white")
            text_position = (text_position[0], text_position[1] + line_height)
        # Save the annotated image
        output_path = os.path.join(output_folder, os.path.basename(img_path))
        img_with_text.save(output_path)
        print(f"Saved result to: {output_path}")
    except Exception as e:
        print(f"Error processing {img_path}: {str(e)}")

print("Processing completed!")
```
## Fine-Tuning (TODO)
TODO