feat/cody: Brings image modality for BYOK users #6354

Open · wants to merge 15 commits into base: main
15 changes: 14 additions & 1 deletion lib/shared/src/llm-providers/google/chat-client.ts
@@ -1,4 +1,4 @@
import type { GeminiCompletionResponse } from '.'
import type { GeminiChatMessage, GeminiCompletionResponse } from '.'
import type { ChatNetworkClientParams } from '..'
import { getCompletionsModelConfig, logDebug } from '../..'
import { onAbort } from '../../common/abortController'
@@ -46,6 +46,19 @@ export async function googleChatClient({
// Construct the messages array for the API
const messages = await constructGeminiChatMessages(params.messages)

// Adds an inline image data part to the last user message in the `messages` array, if the `params.images` array has at least one element.
if (params.images && params.images.length > 0) {
const lastUserMessage = messages.at(-1) as GeminiChatMessage | undefined
if (lastUserMessage?.role === 'user') {
lastUserMessage.parts.push({
inline_data: {
mime_type: params.images[0].mimeType,
data: params.images[0].data,
},
})
}
}

// Sends the completion parameters and callbacks to the API.
fetch(apiEndpoint, {
method: 'POST',
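For context, the hunk above appends a Gemini `inline_data` part to the final user turn before the request is sent. A minimal sketch of the resulting message shape (the `inline_data`/`mime_type` field names follow the public Gemini REST API; `attachImage` is a hypothetical helper mirroring the PR's logic, not code from the PR):

```typescript
type MimeType = 'image/jpeg' | 'image/png' | 'image/webp'

interface InlineDataPart {
    inline_data: { mime_type: MimeType; data: string }
}
interface TextPart {
    text: string
}
interface GeminiChatMessage {
    role: string
    parts: (TextPart | InlineDataPart)[]
}

function attachImage(
    messages: GeminiChatMessage[],
    image: { mimeType: MimeType; data: string }
): void {
    // Only append to a trailing user message, as the PR does.
    const last = messages[messages.length - 1]
    if (last?.role === 'user') {
        last.parts.push({ inline_data: { mime_type: image.mimeType, data: image.data } })
    }
}

const messages: GeminiChatMessage[] = [
    { role: 'user', parts: [{ text: 'What is in this image?' }] },
]
attachImage(messages, { mimeType: 'image/png', data: 'iVBORw0KGgo=' })
```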
18 changes: 17 additions & 1 deletion lib/shared/src/llm-providers/google/index.ts
@@ -13,7 +13,23 @@ export interface GeminiCompletionResponse {
}[]
}

export interface ImageData {
data: string
mimeType: MimeType
}

export type MimeType = 'image/jpeg' | 'image/png' | 'image/webp'
export interface InlineDataPart {
inline_data: {
mime_type: MimeType
data: string
}
}
export interface Part {
text: string
}

export interface GeminiChatMessage {
role: string
parts: { text: string }[]
parts: (Part | InlineDataPart)[]
}
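Since `parts` is now a union of text and inline-data parts, consumers have to narrow before reading `text`. A hypothetical guard sketch (`isInlineDataPart` is an assumed name, not defined in the PR):

```typescript
type MimeType = 'image/jpeg' | 'image/png' | 'image/webp'
interface InlineDataPart {
    inline_data: { mime_type: MimeType; data: string }
}
interface Part {
    text: string
}

// Narrow the union by probing for the discriminating property.
function isInlineDataPart(p: Part | InlineDataPart): p is InlineDataPart {
    return 'inline_data' in p
}

const parts: (Part | InlineDataPart)[] = [
    { text: 'hello' },
    { inline_data: { mime_type: 'image/png', data: 'iVBORw0KGgo=' } },
]
const textOnly = parts.filter(p => !isInlineDataPart(p)).map(p => (p as Part).text)
```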
11 changes: 7 additions & 4 deletions lib/shared/src/models/sync.ts
@@ -418,8 +418,11 @@ function getModelsFromVSCodeConfiguration({
configuration: { devModels },
}: PickResolvedConfiguration<{ configuration: 'devModels' }>): Model[] {
return (
devModels?.map(m =>
createModel({
devModels?.map(m => {
const isGeminiFlash = m?.model.includes('gemini-2.0-flash')
const baseTags = [ModelTag.BYOK, ModelTag.Experimental, ModelTag.Local]
const tags = isGeminiFlash ? [...baseTags, ModelTag.Vision] : [...baseTags]
return createModel({
id: `${m.provider}/${m.model}`,
usage: [ModelUsage.Chat, ModelUsage.Edit],
contextWindow: {
@@ -431,9 +434,9 @@
apiEndpoint: m.apiEndpoint,
options: m.options,
},
tags: [ModelTag.Local, ModelTag.BYOK, ModelTag.Experimental],
tags: tags,
})
) ?? []
}) ?? []
)
}

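The tag selection above can be isolated as a pure function. A sketch (the `ModelTag` string values here are assumptions; only the branching logic mirrors the PR):

```typescript
enum ModelTag {
    BYOK = 'byok',
    Experimental = 'experimental',
    Local = 'local',
    Vision = 'vision',
}

// Dev models get the base BYOK/Experimental/Local tags; Gemini 2.0 Flash
// additionally gets the Vision tag so the UI can offer image attachments.
function tagsForDevModel(modelId: string): ModelTag[] {
    const baseTags = [ModelTag.BYOK, ModelTag.Experimental, ModelTag.Local]
    return modelId.includes('gemini-2.0-flash') ? [...baseTags, ModelTag.Vision] : baseTags
}
```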
2 changes: 2 additions & 0 deletions lib/shared/src/sourcegraph-api/completions/types.ts
@@ -1,4 +1,5 @@
import type { SerializedChatMessage } from '../../chat/transcript/messages'
import type { ImageData } from '../../llm-providers/google'
import type { PromptString } from '../../prompt/prompt-string'

interface DoneEvent {
@@ -37,6 +38,7 @@ export interface CompletionParameters {
topP?: number
model?: string
stream?: boolean
images?: ImageData[]
// Configuration for a Predicted Output, which can greatly improve response
// times when large parts of the model response are known ahead of time.
// https://platform.openai.com/docs/guides/latency-optimization#use-predicted-outputs
2 changes: 2 additions & 0 deletions vscode/CHANGELOG.md
@@ -6,6 +6,8 @@ This is a log of all notable changes to Cody for VS Code.

### Added

- Add experimental vision support for the Gemini 2.0 Flash model, enabled via the Cody dev models setting [pull/6354](https://github.com/sourcegraph/cody/pull/6354)

### Fixed

### Changed
60 changes: 59 additions & 1 deletion vscode/src/chat/chat-view/ChatBuilder.ts
@@ -20,6 +20,7 @@ import {
} from '@sourcegraph/cody-shared'

import type { RankedContext } from '@sourcegraph/cody-shared/src/chat/transcript/messages'
import type { ImageData, MimeType } from '@sourcegraph/cody-shared/src/llm-providers/google'
import { Observable, Subject, map } from 'observable-fns'
import { getChatPanelTitle } from './chat-helpers'

@@ -94,7 +95,8 @@ export class ChatBuilder {

public readonly sessionID: string = new Date(Date.now()).toUTCString(),
private messages: ChatMessage[] = [],
private customChatTitle?: string
private customChatTitle?: string,
private images: ImageData[] = []
Contributor commented:
I did it this way to get the prototype demo-ready as my hackathon project, but I don't think this is the best approach (my bad!). Instead of passing it to ChatBuilder, could we add a new ContextItem type for media data so the images could be preserved in chat history?

Collaborator (author) replied:
I have also thought about this and find this a great idea. Not only would the user have visual feedback, but there would also be multiple media blobs available in the future.
In its current state, however, this would collide with the other non-visual models, which would mean adding an additional context filter later on (totally feasible).

Additionally, I'm not sure whether a large number of images in the chat history would hurt performance; depending on the machine's specs, chat history management already slows down with large histories: https://linear.app/sourcegraph/issue/CODY-4516/vscode-cody-extension-lags-with-large-chat-history-40-items

) {}

/** An observable that emits whenever the {@link ChatBuilder}'s chat changes. */
@@ -310,6 +312,62 @@
}
return result
}

/**
* Replaces the pending image on the `ChatBuilder` with the given one.
*
* @param imageUri - The base64-encoded image data, optionally prefixed as a data URI. An empty string clears the pending image.
* @returns A `Promise` that resolves once the image has been stored.
*/
public async addImage(imageUri: string): Promise<void> {
this.images = []
if (imageUri === '') {
return
}
this.images.push({
data: imageUri,
mimeType: this.detectMimeType(imageUri),
})
}

/**
* Retrieves and resets the images collected by the `ChatBuilder`.
*
* @returns The array of `ImageData` objects collected so far, or `undefined` if no images have been collected.
*/
public getAndResetImages(): ImageData[] | undefined {
const images = this.images
this.images = []
return images.length ? images : undefined
}

/**
* Detects the MIME type of an image encoded in base64 format.
*
* @param base64String - The base64-encoded image data.
* @returns The MIME type of the image, such as 'image/jpeg', 'image/png', or 'image/webp'. If the MIME type cannot be detected, it defaults to 'image/jpeg'.
*/
private detectMimeType(base64String: string): MimeType {
// Remove data URI prefix if present
const base64Data = base64String.replace(/^data:image\/\w+;base64,/, '')

// Get first 10 bytes from base64
const binaryStart = atob(base64Data).slice(0, 10)

// Check magic numbers using charCodes
if (binaryStart.charCodeAt(0) === 0xff && binaryStart.charCodeAt(1) === 0xd8) {
return 'image/jpeg'
}
if (binaryStart.charCodeAt(0) === 0x89 && binaryStart.charCodeAt(1) === 0x50) {
return 'image/png'
}
if (binaryStart.charCodeAt(8) === 0x57 && binaryStart.charCodeAt(9) === 0x45) {
return 'image/webp'
}

// Default to jpeg if unknown
return 'image/jpeg'
}
}

function messageToSerializedChatInteraction(
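The magic-number sniffing in `detectMimeType` above can be exercised standalone. A sketch of the same idea, with one deliberate deviation: the WebP check compares the full `WEBP` tag at bytes 8–11 of the RIFF header rather than only bytes 8–9, which is slightly stricter. The fall-back to `image/jpeg` for unknown formats matches the PR.

```typescript
type MimeType = 'image/jpeg' | 'image/png' | 'image/webp'

function detectMimeType(base64String: string): MimeType {
    // Strip a data-URI prefix if present, then decode only the header bytes.
    const base64Data = base64String.replace(/^data:image\/\w+;base64,/, '')
    const head = atob(base64Data).slice(0, 12)
    if (head.charCodeAt(0) === 0xff && head.charCodeAt(1) === 0xd8) return 'image/jpeg' // JPEG SOI marker
    if (head.charCodeAt(0) === 0x89 && head.charCodeAt(1) === 0x50) return 'image/png' // \x89PNG
    if (head.slice(8, 12) === 'WEBP') return 'image/webp' // RIFF....WEBP container
    return 'image/jpeg' // default when unknown
}
```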
5 changes: 5 additions & 0 deletions vscode/src/chat/chat-view/ChatController.ts
@@ -543,6 +543,10 @@ export class ChatController implements vscode.Disposable, vscode.WebviewViewProv
logger(message.filterLabel, message.message)
break
}
case 'chat/upload-image': {
await this.chatBuilder.addImage(message.image)
break
}
}
}

@@ -1746,6 +1750,7 @@ export class ChatController implements vscode.Disposable, vscode.WebviewViewProv
const params = {
model,
maxTokensToSample: contextWindow.output,
images: this.chatBuilder.getAndResetImages(),
} as CompletionParameters

// Set stream param only when the model is disabled for streaming.
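The hand-off above is one-shot: an image added via `addImage` rides along on exactly one completion request and is then cleared by `getAndResetImages`. The semantics in isolation (class and method names here are illustrative, not from the PR):

```typescript
class PendingImageBuffer {
    private images: string[] = []

    // Mirrors addImage: each upload replaces any previously pending image,
    // and an empty string clears the buffer.
    add(imageUri: string): void {
        this.images = imageUri === '' ? [] : [imageUri]
    }

    // Mirrors getAndResetImages: drain the buffer, returning undefined when empty.
    takeAll(): string[] | undefined {
        const images = this.images
        this.images = []
        return images.length ? images : undefined
    }
}

const buffer = new PendingImageBuffer()
buffer.add('data:image/png;base64,iVBORw0KGgo=')
const first = buffer.takeAll() // one image
const second = buffer.takeAll() // undefined: already consumed
```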
4 changes: 4 additions & 0 deletions vscode/src/chat/protocol.ts
@@ -141,6 +141,10 @@ export type WebviewMessage =
filterLabel: string
message: string
}
| {
command: 'chat/upload-image'
image: string
}
| {
command: 'reevaluateSearchWithSelectedFilters'
index: number
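On the webview side, a sender for the new message variant might look like the sketch below. Both `makeUploadImageMessage` and the `vscode.postMessage` wiring are assumptions; the PR only shows the protocol type.

```typescript
interface UploadImageMessage {
    command: 'chat/upload-image'
    image: string // base64 image data, typically a data URI
}

// Build the message payload matching the new WebviewMessage variant.
function makeUploadImageMessage(dataUri: string): UploadImageMessage {
    return { command: 'chat/upload-image', image: dataUri }
}

const msg = makeUploadImageMessage('data:image/jpeg;base64,/9j/4AAQ')
// The webview would typically forward this with vscode.postMessage(msg).
```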