Commit f98c216 · committed Oct 15, 2024
fix: warn when unknown directives are used
1 parent f4d6072 · commit f98c216

5 files changed, +54 −16 lines changed

 

Diff for: src/module.ts (+8 −8)

@@ -1,3 +1,5 @@
+import type { Preset } from 'unimport'
+import type { Arrayable, AutoI18nConfig, Robots3Rules, RobotsGroupInput, RobotsGroupResolved } from './runtime/types'
 import fsp from 'node:fs/promises'
 import {
   addComponent,
@@ -12,21 +14,19 @@ import {
 import { defu } from 'defu'
 import { installNuxtSiteConfig, updateSiteConfig } from 'nuxt-site-config-kit'
 import { relative } from 'pathe'
-import type { Preset } from 'unimport'
 import { readPackageJSON } from 'pkg-types'
+import { NonHelpfulBots } from './const'
+import { setupDevToolsUI } from './devtools'
+import { resolveI18nConfig, splitPathForI18nLocales } from './i18n'
+import { extendTypes, isNuxtGenerate, resolveNitroPreset } from './kit'
+import { logger } from './logger'
 import {
   asArray,
   normaliseRobotsRouteRule,
   normalizeGroup,
   parseRobotsTxt,
   validateRobots,
 } from './runtime/util'
-import { extendTypes, isNuxtGenerate, resolveNitroPreset } from './kit'
-import type { Arrayable, AutoI18nConfig, Robots3Rules, RobotsGroupInput, RobotsGroupResolved } from './runtime/types'
-import { NonHelpfulBots } from './const'
-import { resolveI18nConfig, splitPathForI18nLocales } from './i18n'
-import { setupDevToolsUI } from './devtools'
-import { logger } from './logger'
 
 export interface ModuleOptions {
   /**
@@ -301,7 +301,7 @@ export default defineNuxtModule<ModuleOptions>({
       const path = relative(nuxt.options.rootDir, usingRobotsTxtPath)
       logger.debug(`A robots.txt file was found at \`./${path}\`, merging config.`)
       const parsedRobotsTxt = parseRobotsTxt(robotsTxt)
-      const errors = validateRobots(parsedRobotsTxt)
+      const { errors } = validateRobots(parsedRobotsTxt)
      if (errors.length > 0) {
         logger.error(`The \`./${path}\` file contains errors:`)
         for (const error of errors)
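The behavioural change in this hunk: validateRobots no longer returns a bare string array, it appends its findings to robotsTxt.errors and returns the parsed object, so the module now destructures errors from the result. A rough sketch of the combined parse/validate flow, simplified from the module code above (the robots.txt input here is made up for illustration):

    import { parseRobotsTxt, validateRobots } from './runtime/util'

    const parsed = parseRobotsTxt('User-Agent: *\nUnknown: /bar')
    // parser-level problems are now collected instead of silently dropped,
    // e.g. an entry along the lines of "L1: Unknown directive unknown "
    const { errors } = validateRobots(parsed)
    // validateRobots pushes its own findings onto the same array, e.g. the
    // "has no allow or disallow rules" message for the group above
    for (const error of errors)
      console.warn(error)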

Diff for: src/runtime/types.ts (+1)

@@ -5,6 +5,7 @@ export type Arrayable<T> = T | T[]
 export interface ParsedRobotsTxt {
   groups: RobotsGroupResolved[]
   sitemaps: string[]
+  errors: string[]
 }
 
 export type RobotsGroupInput = GoogleInput | YandexInput
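Because errors is now a required property of ParsedRobotsTxt, anything that builds the parsed shape by hand (test fixtures, programmatic input) has to supply it, even when empty — which is exactly what the validator test change further down does. A minimal sketch with placeholder values:

    import type { ParsedRobotsTxt } from './types'

    const parsed: ParsedRobotsTxt = {
      groups: [],                                    // RobotsGroupResolved[]
      sitemaps: ['https://example.com/sitemap.xml'], // placeholder URL
      errors: [],                                    // new field, filled by parseRobotsTxt/validateRobots
    }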

Diff for: src/runtime/util.ts (+25 −7)

@@ -1,7 +1,7 @@
-import { createDefu } from 'defu'
 import type { NitroRouteConfig } from 'nitropack'
-import { withoutLeadingSlash } from 'ufo'
 import type { ParsedRobotsTxt, RobotsGroupInput, RobotsGroupResolved } from './types'
+import { createDefu } from 'defu'
+import { withoutLeadingSlash } from 'ufo'
 
 /**
  * We're going to read the robots.txt and extract any disallow or sitemaps rules from it.
@@ -15,20 +15,24 @@ import type { ParsedRobotsTxt, RobotsGroupInput, RobotsGroupResolved } from './t
  * - host: the host name of the site, this is optional non-standard directive.
  *
  * @see https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
+ * @see https://github.com/google/robotstxt/blob/86d5836ba2d5a0b6b938ab49501be0e09d9c276c/robots.cc#L714C1-L720C2
  */
 export function parseRobotsTxt(s: string): ParsedRobotsTxt {
   // then we'll extract the disallow and sitemap rules
   const groups: RobotsGroupResolved[] = []
   const sitemaps: string[] = []
+  const errors: string[] = []
   let createNewGroup = false
   let currentGroup: RobotsGroupResolved = {
     comment: [], // comments are too hard to parse in a logical order, we'll just omit them
     disallow: [],
     allow: [],
     userAgent: [],
   }
+  let ln = -1
   // read the contents
   for (const line of s.split('\n')) {
+    ln++
     const sepIndex = line.indexOf(':')
     // may not exist for comments
     if (sepIndex === -1)
@@ -39,6 +43,8 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
 
     switch (rule) {
       case 'user-agent':
+      case 'useragent':
+      case 'user agent':
         if (createNewGroup) {
           groups.push({
             ...currentGroup,
@@ -58,20 +64,32 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
         createNewGroup = true
         break
       case 'disallow':
+      case 'dissallow':
+      case 'dissalow':
+      case 'disalow':
+      case 'diasllow':
+      case 'disallaw':
         currentGroup.disallow.push(val)
         createNewGroup = true
         break
       case 'sitemap':
+      case 'site-map':
         sitemaps.push(val)
         break
       case 'host':
         currentGroup.host = val
         break
       case 'clean-param':
-        if (currentGroup.userAgent.includes('Yandex')) {
+        if (currentGroup.userAgent.some(u => u.toLowerCase().includes('yandex'))) {
           currentGroup.cleanParam = currentGroup.cleanParam || []
           currentGroup.cleanParam.push(val)
         }
+        else {
+          errors.push(`L${ln}: Clean-param directive is only when targeting Yandex user agent.`)
+        }
+        break
+      default:
+        errors.push(`L${ln}: Unknown directive ${rule} `)
         break
     }
   }
@@ -82,6 +100,7 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
   return {
     groups,
     sitemaps,
+    errors,
   }
 }
 
@@ -167,17 +186,16 @@ export function matchPathToRule(path: string, _rules: RobotsGroupResolved['_rule
 }
 
 export function validateRobots(robotsTxt: ParsedRobotsTxt) {
-  const errors: string[] = []
   // 1. check that the include / exclude is either empty or starts with a slash OR a wildcard
   robotsTxt.groups = robotsTxt.groups.filter((group) => {
     if (!group.allow.length && !group.disallow.length) {
-      errors.push(`Group "${group.userAgent.join(', ')}" has no allow or disallow rules. You must provide one of either.`)
+      robotsTxt.errors.push(`Group "${group.userAgent.join(', ')}" has no allow or disallow rules. You must provide one of either.`)
       return false
     }
-    validateGroupRules(group, errors)
+    validateGroupRules(group, robotsTxt.errors)
     return true
   })
-  return errors
+  return robotsTxt
 }
 
 export function asArray(v: any)
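In practice the parser now tolerates common misspellings and reports any directive it cannot place. A hedged sketch of what that looks like for a made-up input (import path mirrors the test files below; the expected values are inferred from the diff, not taken from a real run):

    import { parseRobotsTxt } from '../../src/runtime/util'

    const { groups, errors } = parseRobotsTxt([
      'User-Agent: GoogleBot',
      'Dissallow: /private',        // common misspelling, now handled as Disallow
      'Clean-param: ref /articles', // not a Yandex group, so it is reported rather than applied
    ].join('\n'))

    // groups[0].disallow should contain '/private'
    // errors should contain something like
    //   'L2: Clean-param directive is only when targeting Yandex user agent.'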

Diff for: test/unit/robotsTxtParser.test.ts (+18)

@@ -8,6 +8,7 @@ describe('robotsTxtParser', () => {
     const robotsTxt = await fsp.readFile('./test/fixtures/yoastRobots.txt', { encoding: 'utf-8' })
     expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
       {
+        "errors": [],
         "groups": [
           {
             "allow": [],
@@ -58,6 +59,7 @@ describe('robotsTxtParser', () => {
     const robotsTxt = await fsp.readFile('./test/fixtures/squareSpace.txt', { encoding: 'utf-8' })
     expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
       {
+        "errors": [],
         "groups": [
           {
             "allow": [
@@ -224,6 +226,7 @@ describe('robotsTxtParser', () => {
     const robotsTxt = await fsp.readFile('./test/fixtures/issue36.txt', { encoding: 'utf-8' })
     expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
       {
+        "errors": [],
         "groups": [
           {
             "allow": [],
@@ -249,6 +252,7 @@ describe('robotsTxtParser', () => {
     const robotsTxt = await fsp.readFile('./test/fixtures/yandex.txt', { encoding: 'utf-8' })
     expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
       {
+        "errors": [],
         "groups": [
           {
             "allow": [],
@@ -287,6 +291,7 @@ describe('robotsTxtParser', () => {
     const robotsTxt = await fsp.readFile('./test/fixtures/startgroupRobots.txt', { encoding: 'utf-8' })
     expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
       {
+        "errors": [],
         "groups": [
           {
             "allow": [
@@ -337,4 +342,17 @@ describe('robotsTxtParser', () => {
       }
     `)
   })
+
+  it('collects errors', async () => {
+    // read fixture startgroupRobots.txt
+    expect(parseRobotsTxt(`
+User-Agent: *
+Disallow: /foo
+Unknown: /bar
+`).errors).toMatchInlineSnapshot(`
+      [
+        "L3: Unknown directive unknown ",
+      ]
+    `)
+  })
 })

Diff for: test/unit/robotsTxtValidator.test.ts (+2 −1)

@@ -3,7 +3,8 @@ import { validateRobots } from '../../src/runtime/util'
 
 describe('robotsTxtValidator', () => {
   it('basic', async () => {
-    const errors = validateRobots({
+    const { errors } = validateRobots({
+      errors: [],
       sitemaps: [],
       groups: [
         {
