Skip to content

Commit

Permalink
feat: support vision (#249)
Browse files Browse the repository at this point in the history
* feat: support vision

* clippy

* implement vision

* resolve data url to local file

* add model openai:gpt-4-vision-preview

* use newline to concatenate embedded text files

* set max_tokens for gpt-4-vision-preview
  • Loading branch information
sigoden authored Nov 27, 2023
1 parent 5bfe95d commit 35c7550
Show file tree
Hide file tree
Showing 19 changed files with 494 additions and 108 deletions.
100 changes: 96 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ reqwest-eventsource = "0.5.0"
simplelog = "0.12.1"
log = "0.4.20"
shell-words = "1.1.0"
mime_guess = "2.0.4"
sha2 = "0.10.8"

[dependencies.reqwest]
version = "0.11.14"
Expand Down
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Download it from [GitHub Releases](https://github.com/sigoden/aichat/releases),
- Support chat and command modes
- Use [Roles](#roles)
- Powerful [Chat REPL](#chat-repl)
- Support vision
- Context-aware conversation/session
- Syntax highlighting markdown and 200 other languages
- Stream output with hand-typing effect
Expand Down Expand Up @@ -147,9 +148,9 @@ The Chat REPL supports:
.session Start a context-aware chat session
.info session Show session info
.exit session End the current session
.file Attach files to the message and then submit it
.set Modify the configuration parameters
.copy Copy the last reply to the clipboard
.read Read files into the message and submit
.exit Exit the REPL

Type ::: to begin multi-line editing, type ::: to end it.
Expand Down Expand Up @@ -255,6 +256,17 @@ The prompt on the right side is about the current usage of tokens and the propor
compared to the maximum number of tokens allowed by the model.
### `.file` - attach files to the message
```
Usage: .file <file>... [-- text...]

.file message.txt
.file config.yaml -- convert to toml
.file a.jpg b.jpg -- What’s in these images?
.file https://ibb.co/a.png https://ibb.co/b.png -- what is the difference?
```
### `.set` - modify the configuration temporarily
```
Expand All @@ -277,6 +289,7 @@ Options:
-m, --model <MODEL> Choose a LLM model
-r, --role <ROLE> Choose a role
-s, --session [<SESSION>] Create or reuse a session
-f, --file <FILE>... Attach files to the message to be sent
-H, --no-highlight Disable syntax highlighting
-S, --no-stream No stream output
-w, --wrap <WRAP> Specify the text-wrapping mode (no*, auto, <max-width>)
Expand Down Expand Up @@ -306,6 +319,9 @@ cat config.json | aichat convert to yaml # Read stdin
cat config.json | aichat -r convert:yaml # Read stdin with a role
cat config.json | aichat -s i18n # Read stdin with a session
aichat --file a.png b.png -- diff images # Attach files
aichat --file screenshot.png -r ocr # Attach files with a role
aichat --list-models # List all available models
aichat --list-roles # List all available roles
aichat --list-sessions                    # List all available sessions
Expand Down
3 changes: 3 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ pub struct Cli {
/// Create or reuse a session
#[clap(short = 's', long)]
pub session: Option<Option<String>>,
/// Attach files to the message to be sent.
#[clap(short = 'f', long, num_args = 1.., value_name = "FILE")]
pub file: Option<Vec<String>>,
/// Disable syntax highlighting
#[clap(short = 'H', long)]
pub no_highlight: bool,
Expand Down
19 changes: 10 additions & 9 deletions src/client/common.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use super::{openai::OpenAIConfig, ClientConfig, Message};

use crate::{
config::GlobalConfig,
config::{GlobalConfig, Input},
render::ReplyHandler,
utils::{
init_tokio_runtime, prompt_input_integer, prompt_input_string, tokenize, AbortSignal,
Expand Down Expand Up @@ -50,7 +50,7 @@ macro_rules! register_client {
}

impl $client {
pub const NAME: &str = $name;
pub const NAME: &'static str = $name;

pub fn init(global_config: &$crate::config::GlobalConfig) -> Option<Box<dyn Client>> {
let model = global_config.read().model.clone();
Expand Down Expand Up @@ -186,22 +186,22 @@ pub trait Client {
Ok(client)
}

fn send_message(&self, content: &str) -> Result<String> {
fn send_message(&self, input: Input) -> Result<String> {
init_tokio_runtime()?.block_on(async {
let global_config = self.config().0;
if global_config.read().dry_run {
let content = global_config.read().echo_messages(content);
let content = global_config.read().echo_messages(&input);
return Ok(content);
}
let client = self.build_client()?;
let data = global_config.read().prepare_send_data(content, false)?;
let data = global_config.read().prepare_send_data(&input, false)?;
self.send_message_inner(&client, data)
.await
.with_context(|| "Failed to get answer")
})
}

fn send_message_streaming(&self, content: &str, handler: &mut ReplyHandler) -> Result<()> {
fn send_message_streaming(&self, input: &Input, handler: &mut ReplyHandler) -> Result<()> {
async fn watch_abort(abort: AbortSignal) {
loop {
if abort.aborted() {
Expand All @@ -211,12 +211,13 @@ pub trait Client {
}
}
let abort = handler.get_abort();
init_tokio_runtime()?.block_on(async {
let input = input.clone();
init_tokio_runtime()?.block_on(async move {
tokio::select! {
ret = async {
let global_config = self.config().0;
if global_config.read().dry_run {
let content = global_config.read().echo_messages(content);
let content = global_config.read().echo_messages(&input);
let tokens = tokenize(&content);
for token in tokens {
tokio::time::sleep(Duration::from_millis(10)).await;
Expand All @@ -225,7 +226,7 @@ pub trait Client {
return Ok(());
}
let client = self.build_client()?;
let data = global_config.read().prepare_send_data(content, true)?;
let data = global_config.read().prepare_send_data(&input, true)?;
self.send_message_streaming_inner(&client, handler, data).await
} => {
handler.done()?;
Expand Down
8 changes: 5 additions & 3 deletions src/client/ernie.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{ErnieClient, Client, ExtraConfig, PromptType, SendData, Model};
use super::{ErnieClient, Client, ExtraConfig, PromptType, SendData, Model, MessageContent};

use crate::{
config::GlobalConfig,
Expand Down Expand Up @@ -198,8 +198,10 @@ fn build_body(data: SendData, _model: String) -> Value {

if messages[0].role.is_system() {
let system_message = messages.remove(0);
if let Some(message) = messages.get_mut(0) {
message.content = format!("{}\n\n{}", system_message.content, message.content)
if let (Some(message), MessageContent::Text(system_text)) = (messages.get_mut(0), system_message.content) {
if let MessageContent::Text(text) = message.content.clone() {
message.content = MessageContent::Text(format!("{}\n\n{}", system_text, text))
}
}
}

Expand Down
Loading

0 comments on commit 35c7550

Please sign in to comment.