Skip to content

Commit 2e1101a

Browse files
authored Mar 13, 2025
feat (provider/openai): pdf input support (#5194)
1 parent 942af2c commit 2e1101a

11 files changed

+299
-6
lines changed
 

‎.changeset/odd-squids-unite.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
'@ai-sdk/provider': patch
3+
'@ai-sdk/openai': patch
4+
'ai': patch
5+
---
6+
7+
feat (provider/openai): pdf input support

‎content/docs/02-foundations/03-prompts.mdx

+3-2
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,8 @@ const result = await generateText({
201201
Generative AI](/providers/ai-sdk-providers/google-generative-ai), [Google
202202
Vertex AI](/providers/ai-sdk-providers/google-vertex),
203203
[OpenAI](/providers/ai-sdk-providers/openai) (for `wav` and `mp3` audio with
204-
`gpt-4o-audio-preview`), [Anthropic](/providers/ai-sdk-providers/anthropic)
205-
(for `pdf`).
204+
`gpt-4o-audio-preview`), [Anthropic](/providers/ai-sdk-providers/anthropic),
205+
and [OpenAI responses](/providers/ai-sdk-providers/openai) (both for `pdf`).
206206
</Note>
207207

208208
User messages can include file parts. A file can be one of the following:
@@ -237,6 +237,7 @@ const result = await generateText({
237237
type: 'file',
238238
mimeType: 'application/pdf',
239239
data: fs.readFileSync('./data/example.pdf'),
240+
filename: 'example.pdf', // optional, not used by all providers
240241
},
241242
],
242243
},
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { openai } from '@ai-sdk/openai';
2+
import { generateText } from 'ai';
3+
import 'dotenv/config';
4+
5+
async function main() {
6+
const result = await generateText({
7+
model: openai.responses('gpt-4o'),
8+
messages: [
9+
{
10+
role: 'user',
11+
content: [
12+
{
13+
type: 'text',
14+
text: 'What is an embedding model according to this document?',
15+
},
16+
{
17+
type: 'file',
18+
data: new URL(
19+
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/ai.pdf?raw=true',
20+
),
21+
mimeType: 'application/pdf',
22+
filename: 'ai.pdf',
23+
},
24+
],
25+
},
26+
],
27+
});
28+
29+
console.log(result.text);
30+
}
31+
32+
main().catch(console.error);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { openai } from '@ai-sdk/openai';
2+
import { generateText } from 'ai';
3+
import 'dotenv/config';
4+
import fs from 'node:fs';
5+
6+
async function main() {
7+
const result = await generateText({
8+
model: openai.responses('gpt-4o'),
9+
messages: [
10+
{
11+
role: 'user',
12+
content: [
13+
{
14+
type: 'text',
15+
text: 'What is an embedding model according to this document?',
16+
},
17+
{
18+
type: 'file',
19+
data: fs.readFileSync('./data/ai.pdf'),
20+
mimeType: 'application/pdf',
21+
// filename: 'ai.pdf',
22+
},
23+
],
24+
},
25+
],
26+
});
27+
28+
console.log(result.text);
29+
}
30+
31+
main().catch(console.error);

‎packages/ai/core/prompt/content-part.ts

+6
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@ File data. Can either be:
101101
*/
102102
data: DataContent | URL;
103103

104+
/**
105+
Optional filename of the file.
106+
*/
107+
filename?: string;
108+
104109
/**
105110
Mime type of the file.
106111
*/
@@ -125,6 +130,7 @@ functionality that can be fully encapsulated in the provider.
125130
export const filePartSchema: z.ZodType<FilePart> = z.object({
126131
type: z.literal('file'),
127132
data: z.union([dataContentSchema, z.instanceof(URL)]),
133+
filename: z.string().optional(),
128134
mimeType: z.string(),
129135
providerOptions: providerMetadataSchema.optional(),
130136
experimental_providerMetadata: providerMetadataSchema.optional(),

‎packages/ai/core/prompt/convert-to-language-model-prompt.test.ts

+81
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,87 @@ describe('convertToLanguageModelPrompt', () => {
444444
},
445445
]);
446446
});
447+
448+
it('should handle file parts with filename', async () => {
449+
const result = await convertToLanguageModelPrompt({
450+
prompt: {
451+
type: 'messages',
452+
messages: [
453+
{
454+
role: 'user',
455+
content: [
456+
{
457+
type: 'file',
458+
data: 'SGVsbG8sIFdvcmxkIQ==', // "Hello, World!" in base64
459+
mimeType: 'text/plain',
460+
filename: 'hello.txt',
461+
},
462+
],
463+
},
464+
],
465+
},
466+
modelSupportsImageUrls: true,
467+
modelSupportsUrl: undefined,
468+
});
469+
470+
expect(result).toEqual([
471+
{
472+
role: 'user',
473+
content: [
474+
{
475+
type: 'file',
476+
data: 'SGVsbG8sIFdvcmxkIQ==',
477+
mimeType: 'text/plain',
478+
filename: 'hello.txt',
479+
},
480+
],
481+
},
482+
]);
483+
});
484+
485+
it('should preserve filename when downloading file from URL', async () => {
486+
const result = await convertToLanguageModelPrompt({
487+
prompt: {
488+
type: 'messages',
489+
messages: [
490+
{
491+
role: 'user',
492+
content: [
493+
{
494+
type: 'file',
495+
data: new URL('https://example.com/document.pdf'),
496+
mimeType: 'application/pdf',
497+
filename: 'important-document.pdf',
498+
},
499+
],
500+
},
501+
],
502+
},
503+
modelSupportsImageUrls: false,
504+
modelSupportsUrl: () => false,
505+
downloadImplementation: async ({ url }) => {
506+
expect(url).toEqual(new URL('https://example.com/document.pdf'));
507+
return {
508+
data: new Uint8Array([0, 1, 2, 3]),
509+
mimeType: 'application/pdf',
510+
};
511+
},
512+
});
513+
514+
expect(result).toEqual([
515+
{
516+
role: 'user',
517+
content: [
518+
{
519+
type: 'file',
520+
mimeType: 'application/pdf',
521+
data: convertUint8ArrayToBase64(new Uint8Array([0, 1, 2, 3])),
522+
filename: 'important-document.pdf',
523+
},
524+
],
525+
},
526+
]);
527+
});
447528
});
448529

449530
describe('provider metadata', async () => {

‎packages/ai/core/prompt/convert-to-language-model-prompt.ts

+1
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,7 @@ function convertPartToLanguageModelPart(
323323
normalizedData instanceof Uint8Array
324324
? convertDataContentToBase64String(normalizedData)
325325
: normalizedData,
326+
filename: part.filename,
326327
mimeType,
327328
providerMetadata:
328329
part.providerOptions ?? part.experimental_providerMetadata,

‎packages/openai/src/responses/convert-to-openai-responses-messages.test.ts

+109
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,115 @@ describe('convertToOpenAIResponsesMessages', () => {
174174
},
175175
]);
176176
});
177+
178+
it('should convert messages with PDF file parts', async () => {
179+
const base64Data = 'AQIDBAU='; // Base64 encoding of the bytes [1, 2, 3, 4, 5]
180+
181+
const result = convertToOpenAIResponsesMessages({
182+
prompt: [
183+
{
184+
role: 'user',
185+
content: [
186+
{
187+
type: 'file',
188+
mimeType: 'application/pdf',
189+
data: base64Data,
190+
filename: 'document.pdf',
191+
},
192+
],
193+
},
194+
],
195+
systemMessageMode: 'system',
196+
});
197+
198+
expect(result.messages).toEqual([
199+
{
200+
role: 'user',
201+
content: [
202+
{
203+
type: 'input_file',
204+
filename: 'document.pdf',
205+
file_data: 'data:application/pdf;base64,AQIDBAU=',
206+
},
207+
],
208+
},
209+
]);
210+
});
211+
212+
it('should use default filename for PDF file parts when not provided', async () => {
213+
const base64Data = 'AQIDBAU=';
214+
215+
const result = convertToOpenAIResponsesMessages({
216+
prompt: [
217+
{
218+
role: 'user',
219+
content: [
220+
{
221+
type: 'file',
222+
mimeType: 'application/pdf',
223+
data: base64Data,
224+
},
225+
],
226+
},
227+
],
228+
systemMessageMode: 'system',
229+
});
230+
231+
expect(result.messages).toEqual([
232+
{
233+
role: 'user',
234+
content: [
235+
{
236+
type: 'input_file',
237+
filename: 'part-0.pdf',
238+
file_data: 'data:application/pdf;base64,AQIDBAU=',
239+
},
240+
],
241+
},
242+
]);
243+
});
244+
245+
it('should throw error for unsupported file types', async () => {
246+
const base64Data = 'AQIDBAU=';
247+
248+
expect(() => {
249+
convertToOpenAIResponsesMessages({
250+
prompt: [
251+
{
252+
role: 'user',
253+
content: [
254+
{
255+
type: 'file',
256+
mimeType: 'text/plain',
257+
data: base64Data,
258+
},
259+
],
260+
},
261+
],
262+
systemMessageMode: 'system',
263+
});
264+
}).toThrow('Only PDF files are supported in user messages');
265+
});
266+
267+
it('should throw error for file URLs', async () => {
268+
expect(() => {
269+
convertToOpenAIResponsesMessages({
270+
prompt: [
271+
{
272+
role: 'user',
273+
content: [
274+
{
275+
type: 'file',
276+
mimeType: 'application/pdf',
277+
data: new URL('https://example.com/document.pdf'),
278+
},
279+
],
280+
},
281+
],
282+
systemMessageMode: 'system',
283+
});
284+
}).toThrow('File URLs in user messages');
285+
});
177286
});
178287

179288
describe('assistant messages', () => {

‎packages/openai/src/responses/convert-to-openai-responses-messages.ts

+23-4
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ export function convertToOpenAIResponsesMessages({
5151
case 'user': {
5252
messages.push({
5353
role: 'user',
54-
content: content.map(part => {
54+
content: content.map((part, index) => {
5555
switch (part.type) {
5656
case 'text': {
5757
return { type: 'input_text', text: part.text };
@@ -71,9 +71,28 @@ export function convertToOpenAIResponsesMessages({
7171
};
7272
}
7373
case 'file': {
74-
throw new UnsupportedFunctionalityError({
75-
functionality: 'Image content parts in user messages',
76-
});
74+
if (part.data instanceof URL) {
75+
// The AI SDK automatically downloads files for user file parts with URLs
76+
throw new UnsupportedFunctionalityError({
77+
functionality: 'File URLs in user messages',
78+
});
79+
}
80+
81+
switch (part.mimeType) {
82+
case 'application/pdf': {
83+
return {
84+
type: 'input_file',
85+
filename: part.filename ?? `part-${index}.pdf`,
86+
file_data: `data:application/pdf;base64,${part.data}`,
87+
};
88+
}
89+
default: {
90+
throw new UnsupportedFunctionalityError({
91+
functionality:
92+
'Only PDF files are supported in user messages',
93+
});
94+
}
95+
}
7796
}
7897
}
7998
}),

‎packages/openai/src/responses/openai-responses-api-types.ts

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export type OpenAIResponsesUserMessage = {
1919
content: Array<
2020
| { type: 'input_text'; text: string }
2121
| { type: 'input_image'; image_url: string }
22+
| { type: 'input_file'; filename: string; file_data: string }
2223
>;
2324
};
2425

‎packages/provider/src/language-model/v1/language-model-v1-prompt.ts

+5
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ File content part of a prompt. It contains a file.
143143
export interface LanguageModelV1FilePart {
144144
type: 'file';
145145

146+
/**
147+
* Optional filename of the file.
148+
*/
149+
filename?: string;
150+
146151
/**
147152
File data as base64 encoded string or as a URL.
148153
*/

0 commit comments

Comments
 (0)
Please sign in to comment.