
feat: custom file formats in file content loader (#12047)

* add custom file format support

* add tests

* lint/format

* changeset

* nits

* finish tests

* add nested json test

* requested changes

* update changeset with @sarah11918 suggestions

* typos/formatting

* add map<id, data> yaml test

* fix tests and rebase
Rohan Godha 2024-09-29 14:41:15 -04:00 committed by GitHub
parent 0a1036eef6
commit 21b5e806c5
10 changed files with 408 additions and 29 deletions

View file

@ -0,0 +1,68 @@
---
'astro': minor
---
Adds a new optional `parser` property to the built-in `file()` loader for content collections to support additional file types such as `toml` and `csv`.
The `file()` loader now accepts a second argument that defines a `parser` function. This allows you to specify a custom parser (e.g. `toml.parse` or `csv-parse`) to create a collection from a file's contents. The `file()` loader will automatically detect and parse JSON and YAML files (based on their file extension) with no need for a `parser`.
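For example, a plain JSON file can be loaded with no `parser` argument at all. A minimal sketch (the schema fields here are assumptions for illustration):
```typescript
import { defineCollection, z } from "astro:content"
import { file } from "astro/loaders"

// The `.json` extension is detected automatically, so no `parser` is needed
const dogs = defineCollection({
  loader: file("src/data/dogs.json"),
  schema: z.object({ id: z.string(), breed: z.string() }),
})
```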
This works with any custom file format, including `csv` and `toml`. The following example defines a content collection `dogs` using a `.toml` file.
```toml
[[dogs]]
id = "..."
age = "..."
[[dogs]]
id = "..."
age = "..."
```
After importing a TOML parser, you can load the `dogs` collection into your project by passing both a file path and a `parser` to the `file()` loader.
```typescript
import { defineCollection } from "astro:content"
import { file } from "astro/loaders"
import { parse as parseToml } from "toml"
const dogs = defineCollection({
loader: file("src/data/dogs.toml", { parser: (text) => parseToml(text).dogs }),
schema: /* ... */
})
// it also works with CSVs!
import { parse as parseCsv } from "csv-parse/sync";
const cats = defineCollection({
loader: file("src/data/cats.csv", { parser: (text) => parseCsv(text, { columns: true, skipEmptyLines: true })})
});
```
The `parser` argument also allows you to load a single collection from a nested JSON document. For example, this JSON file contains multiple collections:
```json
{"dogs": [{}], "cats": [{}]}
```
You can separate these collections by passing a custom `parser` to the `file()` loader like so:
```typescript
const dogs = defineCollection({
loader: file("src/data/pets.json", { parser: (text) => JSON.parse(text).dogs })
});
const cats = defineCollection({
loader: file("src/data/pets.json", { parser: (text) => JSON.parse(text).cats })
});
```
The `file()` loader also continues to work with maps of `id` to `data`:
```yaml
bubbles:
breed: "Goldfish"
age: 2
finn:
breed: "Betta"
age: 1
```
```typescript
const fish = defineCollection({
loader: file("src/data/fish.yaml"),
schema: z.object({ breed: z.string(), age: z.number() })
});
```
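Entries from these collections can then be queried like any other content collection. A minimal usage sketch, run inside an Astro component or endpoint and assuming the `fish` collection defined above:
```typescript
import { getCollection } from "astro:content"

// Each entry id comes from its YAML key (e.g. "bubbles") and `data` matches the schema
const allFish = await getCollection("fish")
const breeds = allFish.map((entry) => entry.data.breed)
```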

View file

@ -1,25 +1,56 @@
import { promises as fs, existsSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import yaml from 'js-yaml';
import { posixRelative } from '../utils.js';
import type { Loader, LoaderContext } from './types.js';
export interface FileOptions {
/**
* the parsing function to use for this data
* @default JSON.parse or yaml.load, depending on the extension of the file
* */
parser?: (
text: string,
) => Record<string, Record<string, unknown>> | Array<Record<string, unknown>>;
}
/**
* Loads entries from a JSON file. The file must contain an array of objects that contain unique `id` fields, or an object with string keys.
* @todo Add support for other file types, such as YAML, CSV etc.
* @param fileName The path to the JSON file to load, relative to the content directory.
* @param options Additional options for the file loader
*/
export function file(fileName: string): Loader {
export function file(fileName: string, options?: FileOptions): Loader {
if (fileName.includes('*')) {
// TODO: AstroError
throw new Error('Glob patterns are not supported in `file` loader. Use `glob` loader instead.');
}
let parse: ((text: string) => any) | null = null;
const ext = fileName.split('.').at(-1);
if (ext === 'json') {
parse = JSON.parse;
} else if (ext === 'yml' || ext === 'yaml') {
parse = (text) =>
yaml.load(text, {
filename: fileName,
});
}
if (options?.parser) parse = options.parser;
if (parse === null) {
// TODO: AstroError
throw new Error(
`No parser found for file '${fileName}'. Try passing a parser to the \`file\` loader.`,
);
}
async function syncData(filePath: string, { logger, parseData, store, config }: LoaderContext) {
let json: Array<Record<string, unknown>>;
let data: Array<Record<string, unknown>> | Record<string, Record<string, unknown>>;
try {
const data = await fs.readFile(filePath, 'utf-8');
json = JSON.parse(data);
const contents = await fs.readFile(filePath, 'utf-8');
data = parse!(contents);
} catch (error: any) {
logger.error(`Error reading data from ${fileName}`);
logger.debug(error.message);
@ -28,28 +59,28 @@ export function file(fileName: string): Loader {
const normalizedFilePath = posixRelative(fileURLToPath(config.root), filePath);
if (Array.isArray(json)) {
if (json.length === 0) {
if (Array.isArray(data)) {
if (data.length === 0) {
logger.warn(`No items found in ${fileName}`);
}
logger.debug(`Found ${json.length} item array in ${fileName}`);
logger.debug(`Found ${data.length} item array in ${fileName}`);
store.clear();
for (const rawItem of json) {
for (const rawItem of data) {
const id = (rawItem.id ?? rawItem.slug)?.toString();
if (!id) {
logger.error(`Item in ${fileName} is missing an id or slug field.`);
continue;
}
const data = await parseData({ id, data: rawItem, filePath });
store.set({ id, data, filePath: normalizedFilePath });
const parsedData = await parseData({ id, data: rawItem, filePath });
store.set({ id, data: parsedData, filePath: normalizedFilePath });
}
} else if (typeof json === 'object') {
const entries = Object.entries<Record<string, unknown>>(json);
} else if (typeof data === 'object') {
const entries = Object.entries<Record<string, unknown>>(data);
logger.debug(`Found object with ${entries.length} entries in ${fileName}`);
store.clear();
for (const [id, rawItem] of entries) {
const data = await parseData({ id, data: rawItem, filePath });
store.set({ id, data, filePath: normalizedFilePath });
const parsedData = await parseData({ id, data: rawItem, filePath });
store.set({ id, data: parsedData, filePath: normalizedFilePath });
}
} else {
logger.error(`Invalid data in ${fileName}. Must be an array or object.`);

View file

@ -53,11 +53,11 @@ describe('Content Layer', () => {
assert.equal(json.customLoader.length, 5);
});
it('Returns `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('fileLoader'));
assert.ok(Array.isArray(json.fileLoader));
it('Returns json `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('jsonLoader'));
assert.ok(Array.isArray(json.jsonLoader));
const ids = json.fileLoader.map((item) => item.data.id);
const ids = json.jsonLoader.map((item) => item.data.id);
assert.deepEqual(ids, [
'labrador-retriever',
'german-shepherd',
@ -97,6 +97,58 @@ describe('Content Layer', () => {
);
});
it('Returns nested json `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('nestedJsonLoader'));
assert.ok(Array.isArray(json.nestedJsonLoader));
const ids = json.nestedJsonLoader.map((item) => item.data.id);
assert.deepEqual(ids, ['bluejay', 'robin', 'sparrow', 'cardinal', 'goldfinch']);
});
it('Returns yaml `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('yamlLoader'));
assert.ok(Array.isArray(json.yamlLoader));
const ids = json.yamlLoader.map((item) => item.id);
assert.deepEqual(ids, [
'bubbles',
'finn',
'shadow',
'spark',
'splash',
'nemo',
'angel-fish',
'gold-stripe',
'blue-tail',
'bubble-buddy',
]);
});
it('Returns toml `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('tomlLoader'));
assert.ok(Array.isArray(json.tomlLoader));
const ids = json.tomlLoader.map((item) => item.data.id);
assert.deepEqual(ids, [
'crown',
'nikes-on-my-feet',
'stars',
'never-let-me-down',
'no-church-in-the-wild',
'family-ties',
'somebody',
'honest',
]);
});
it('Returns nested json `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('nestedJsonLoader'));
assert.ok(Array.isArray(json.nestedJsonLoader));
const ids = json.nestedJsonLoader.map((item) => item.data.id);
assert.deepEqual(ids, ['bluejay', 'robin', 'sparrow', 'cardinal', 'goldfinch']);
});
it('Returns data entry by id', async () => {
assert.ok(json.hasOwnProperty('dataEntry'));
assert.equal(json.dataEntry.filePath?.split(sep).join(posixSep), 'src/data/dogs.json');
@ -276,10 +328,10 @@ describe('Content Layer', () => {
});
it('Returns `file()` loader collection', async () => {
assert.ok(json.hasOwnProperty('fileLoader'));
assert.ok(Array.isArray(json.fileLoader));
assert.ok(json.hasOwnProperty('jsonLoader'));
assert.ok(Array.isArray(json.jsonLoader));
const ids = json.fileLoader.map((item) => item.data.id);
const ids = json.jsonLoader.map((item) => item.data.id);
assert.deepEqual(ids, [
'labrador-retriever',
'german-shepherd',
@ -348,7 +400,7 @@ describe('Content Layer', () => {
it('updates collection when data file is changed', async () => {
const rawJsonResponse = await fixture.fetch('/collections.json');
const initialJson = devalue.parse(await rawJsonResponse.text());
assert.equal(initialJson.fileLoader[0].data.temperament.includes('Bouncy'), false);
assert.equal(initialJson.jsonLoader[0].data.temperament.includes('Bouncy'), false);
await fixture.editFile('/src/data/dogs.json', (prev) => {
const data = JSON.parse(prev);
@ -359,7 +411,7 @@ describe('Content Layer', () => {
await fixture.onNextDataStoreChange();
const updatedJsonResponse = await fixture.fetch('/collections.json');
const updated = devalue.parse(await updatedJsonResponse.text());
assert.ok(updated.fileLoader[0].data.temperament.includes('Bouncy'));
assert.ok(updated.jsonLoader[0].data.temperament.includes('Bouncy'));
await fixture.resetAllFiles();
});
});

View file

@ -4,6 +4,7 @@
"private": true,
"dependencies": {
"astro": "workspace:*",
"@astrojs/mdx": "workspace:*"
"@astrojs/mdx": "workspace:*",
"toml": "^3.0.0"
}
}

View file

@ -1,6 +1,7 @@
import { defineCollection, z, reference } from 'astro:content';
import { file, glob } from 'astro/loaders';
import { loader } from '../loaders/post-loader.js';
import { parse as parseToml } from 'toml';
const blog = defineCollection({
loader: loader({ url: 'https://jsonplaceholder.typicode.com/posts' }),
@ -118,6 +119,27 @@ const cats = defineCollection({
}),
});
const fish = defineCollection({
loader: file('src/data/fish.yaml'),
schema: z.object({
name: z.string(),
breed: z.string(),
age: z.number(),
}),
});
const birds = defineCollection({
loader: file('src/data/birds.json', {
parser: (text) => JSON.parse(text).birds,
}),
schema: z.object({
id: z.string(),
name: z.string(),
breed: z.string(),
age: z.number(),
}),
});
// Absolute paths should also work
const absoluteRoot = new URL('../../content/space', import.meta.url);
@ -198,14 +220,36 @@ const increment = defineCollection({
},
});
const artists = defineCollection({
loader: file('src/data/music.toml', { parser: (text) => parseToml(text).artists }),
schema: z.object({
id: z.string(),
name: z.string(),
genre: z.string().array(),
}),
});
const songs = defineCollection({
loader: file('src/data/music.toml', { parser: (text) => parseToml(text).songs }),
schema: z.object({
id: z.string(),
name: z.string(),
artists: z.array(reference('artists')),
}),
});
export const collections = {
blog,
dogs,
cats,
fish,
birds,
numbers,
spacecraft,
increment,
images,
artists,
songs,
probes,
rodents,
};

View file

@ -0,0 +1,34 @@
{
"birds": [
{
"id": "bluejay",
"name": "Blue Jay",
"breed": "Cyanocitta cristata",
"age": 3
},
{
"id": "robin",
"name": "Robin",
"breed": "Turdus migratorius",
"age": 2
},
{
"id": "sparrow",
"name": "Sparrow",
"breed": "Passer domesticus",
"age": 1
},
{
"id": "cardinal",
"name": "Cardinal",
"breed": "Cardinalis cardinalis",
"age": 4
},
{
"id": "goldfinch",
"name": "Goldfinch",
"breed": "Spinus tristis",
"age": 2
}
]
}

View file

@ -0,0 +1,42 @@
# map of ids to data
bubbles:
name: "Bubbles"
breed: "Goldfish"
age: 2
finn:
name: "Finn"
breed: "Betta"
age: 1
shadow:
name: "Shadow"
breed: "Catfish"
age: 3
spark:
name: "Spark"
breed: "Tetra"
age: 1
splash:
name: "Splash"
breed: "Guppy"
age: 2
nemo:
name: "Nemo"
breed: "Clownfish"
age: 3
angel-fish:
name: "Angel Fish"
breed: "Angelfish"
age: 4
gold-stripe:
name: "Gold Stripe"
breed: "Molly"
age: 1
blue-tail:
name: "Blue Tail"
breed: "Swordtail"
age: 2
bubble-buddy:
name: "Bubble Buddy"
breed: "Betta"
age: 3

View file

@ -0,0 +1,89 @@
[[artists]]
id = "kendrick-lamar"
name = "Kendrick Lamar"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "mac-miller"
name = "Mac Miller"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "jid"
name = "JID"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "yasiin-bey"
name = "Yasiin Bey"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "kanye-west"
name = "Kanye West"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "jay-z"
name = "JAY-Z"
genre = ["Hip-Hop", "Rap"]
[[artists]]
id = "j-ivy"
name = "J. Ivy"
genre = ["Spoken Word", "Rap"]
[[artists]]
id = "frank-ocean"
name = "Frank Ocean"
genre = ["R&B", "Hip-Hop"]
[[artists]]
id = "the-dream"
name = "The-Dream"
genre = ["R&B", "Hip-Hop"]
[[artists]]
id = "baby-keem"
name = "Baby Keem"
genre = ["Hip-Hop", "Rap"]
[[songs]]
id = "crown"
name = "Crown"
artists = ["kendrick-lamar"]
[[songs]]
id = "nikes-on-my-feet"
name = "Nikes on My Feet"
artists = ["mac-miller"]
[[songs]]
id = "stars"
name = "Stars"
artists = ["jid", "yasiin-bey"]
[[songs]]
id = "never-let-me-down"
name = "Never Let Me Down"
artists = ["kanye-west", "jay-z", "j-ivy"]
[[songs]]
id = "no-church-in-the-wild"
name = "No Church In The Wild"
artists = ["jay-z", "kanye-west", "frank-ocean", "the-dream"]
[[songs]]
id = "family-ties"
name = "family ties"
artists = ["kendrick-lamar", "baby-keem"]
[[songs]]
id = "somebody"
name = "Somebody"
artists = ["jid"]
[[songs]]
id = "honest"
name = "HONEST"
artists = ["baby-keem"]

View file

@ -5,7 +5,7 @@ export async function GET() {
const customLoader = await getCollection('blog', (entry) => {
return entry.data.id < 6;
});
const fileLoader = await getCollection('dogs');
const jsonLoader = await getCollection('dogs');
const dataEntry = await getEntry('dogs', 'beagle');
@ -23,10 +23,17 @@ export async function GET() {
const simpleLoaderObject = await getCollection('rodents')
const probes = await getCollection('probes');
const yamlLoader = await getCollection('fish');
const tomlLoader = await getCollection('songs');
const nestedJsonLoader = await getCollection('birds');
return new Response(
devalue.stringify({
customLoader,
fileLoader,
jsonLoader,
dataEntry,
simpleLoader,
simpleLoaderObject,
@ -35,7 +42,10 @@ export async function GET() {
referencedEntry,
increment,
images,
probes
})
probes,
yamlLoader,
tomlLoader,
nestedJsonLoader,
}),
);
}

View file

@ -2713,6 +2713,9 @@ importers:
astro:
specifier: workspace:*
version: link:../../..
toml:
specifier: ^3.0.0
version: 3.0.0
packages/astro/test/fixtures/content-layer-markdoc:
dependencies:
@ -10327,6 +10330,9 @@ packages:
resolution: {integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==}
engines: {node: '>=0.6'}
toml@3.0.0:
resolution: {integrity: sha512-y/mWCZinnvxjTKYhJ+pYxwD0mRLVvOtdS2Awbgxln6iEnt4rk0yBxeSBHkGJcPucRiG0e55mwWp+g/05rsrd6w==}
totalist@3.0.1:
resolution: {integrity: sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ==}
engines: {node: '>=6'}
@ -16512,6 +16518,8 @@ snapshots:
toidentifier@1.0.1: {}
toml@3.0.0: {}
totalist@3.0.1: {}
tough-cookie@4.1.3: