Skip to content

Commit e5b91c2

Browse files
authored Aug 18, 2024
fix: improved robots.txt matching (#136)
1 parent 4c1cd7c commit e5b91c2

File tree

18 files changed

+288
-119
lines changed

18 files changed

+288
-119
lines changed
 

‎.playground/pages/_dir/robots.txt

+4
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,9 @@ User-agent: *
22
Allow: /secret/exception
33
Disallow: /secret
44
Disallow: /admin
5+
Disallow: /*/hidden
6+
Disallow: /users/*/hidden
7+
Disallow: /?a=
8+
Disallow: /visible?*a=
59

610
Sitemap: /sitemap.xml
+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<template>
2+
<div />
3+
</template>

‎.playground/pages/test3/index.vue

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<template>
2+
<div />
3+
</template>
+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<template>
2+
<div>hidden</div>
3+
</template>

‎.playground/pages/visible.vue

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<template>
2+
<div>hidden</div>
3+
</template>

‎client/app.vue

+13-5
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ const pathDebug = useAsyncData<any>(() => {
3838
if (!appFetch.value || typeof appFetch.value !== 'function') {
3939
return null
4040
}
41-
const query: Record<string, any> = {
41+
const req: Record<string, any> = {
4242
path: path.value,
4343
}
4444
if (envTab.value === 'Production')
45-
query.mockProductionEnv = true
45+
req.mockProductionEnv = true
4646
return appFetch.value('/__robots__/debug-path.json', {
47-
query,
47+
query: req,
4848
})
4949
}, {
5050
watch: [envTab, path, appFetch],
@@ -211,17 +211,25 @@ const tab = useLocalStorage('nuxt-robots:tab', 'overview')
211211
<div v-else>
212212
<div class="inline-flex gap-3 mb-5 items-center">
213213
<div>
214-
<NIcon v-if="pathDebugData?.indexable" icon="carbon:checkmark-filled" class="text-green-300" />
214+
<NIcon v-if="pathDebugData?.allow" icon="carbon:checkmark-filled" class="text-green-300" />
215215
<NIcon v-else icon="carbon:warning-filled" class="text-red-300" />
216216
</div>
217-
<div v-if="pathDebugData?.indexable">
217+
<div v-if="pathDebugData?.allow">
218218
Robots can crawl <code class="opacity-60 text-sm">{{ path }}</code>.
219219
</div>
220220
<div v-else>
221221
Robots are blocked from crawling <code class="opacity-60 text-sm">{{ path }}</code>.
222222
</div>
223223
</div>
224224
<pre of-auto h-full text-sm style="white-space: break-spaces;" v-html="highlight(JSON.stringify(pathDebugData.rule || {}, null, 2), 'json')" />
225+
<div v-if="pathDebugData?.debug" class="mt-3 flex gap-2">
226+
<div v-if="pathDebugData?.debug?.source" class="text-xs text-gray-300 mt-3 border-gray-600 rounded-xl border-1 px-2 py-1 inline-flex">
227+
Source: {{ pathDebugData?.debug?.source }}
228+
</div>
229+
<div v-if="pathDebugData?.debug?.line" class="text-xs text-gray-300 mt-3 border-gray-600 rounded-xl border-1 px-2 py-1 inline-flex">
230+
{{ pathDebugData?.debug?.line }}
231+
</div>
232+
</div>
225233
</div>
226234
</div>
227235
</OSectionBlock>

‎client/composables/rpc.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ onDevtoolsClientConnected(async (client) => {
2020
})
2121
const $route = client.host.nuxt.vueApp.config.globalProperties?.$route
2222
query.value = $route.query
23-
path.value = $route.path || '/'
23+
path.value = $route?.fullPath || '/'
2424
client.host.nuxt.$router.afterEach((route) => {
2525
query.value = route.query
26-
path.value = route.path
26+
path.value = route.fullPath
2727
refreshSources()
2828
})
2929
devtools.value = client.devtools

‎src/module.ts

+21-19
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@ import { installNuxtSiteConfig, updateSiteConfig } from 'nuxt-site-config-kit'
1414
import { relative } from 'pathe'
1515
import type { Preset } from 'unimport'
1616
import { readPackageJSON } from 'pkg-types'
17-
import { asArray, indexableFromGroup, normaliseRobotsRouteRule, parseRobotsTxt, validateRobots } from './runtime/util'
17+
import {
18+
asArray,
19+
normaliseRobotsRouteRule,
20+
normalizeGroup,
21+
parseRobotsTxt,
22+
validateRobots,
23+
} from './runtime/util'
1824
import { extendTypes, isNuxtGenerate, resolveNitroPreset } from './kit'
1925
import type { Arrayable, AutoI18nConfig, Robots3Rules, RobotsGroupInput, RobotsGroupResolved } from './runtime/types'
2026
import { NonHelpfulBots } from './const'
@@ -338,12 +344,7 @@ export default defineNuxtModule<ModuleOptions>({
338344
config.disallow = asArray(config.disallow)
339345
config.allow = asArray(config.allow)
340346
// make sure any groups have a user agent, if not we set it to *
341-
config.groups = config.groups.map((group) => {
342-
group.userAgent = group.userAgent ? asArray(group.userAgent) : ['*']
343-
group.disallow = asArray(group.disallow)
344-
group.allow = asArray(group.allow)
345-
return group
346-
})
347+
config.groups = config.groups.map(normalizeGroup)
347348
// find an existing stack with a user agent that is equal to "['*']"
348349
const existingGroup = (config.groups as RobotsGroupResolved[]).find(stack => stack.userAgent.length === 1 && stack.userAgent[0] === '*')
349350
if (existingGroup) {
@@ -371,16 +372,16 @@ export default defineNuxtModule<ModuleOptions>({
371372
// convert robot routeRules to header routeRules for static hosting
372373
if (config.header) {
373374
Object.entries(nuxt.options.routeRules).forEach(([route, rules]) => {
374-
const url = route.split('/').map(segment => segment.startsWith(':') ? '*' : segment).join('/')
375-
const groupIndexable = indexableFromGroup(config.groups, url)
376-
const robotRules = normaliseRobotsRouteRule(rules, groupIndexable, config.robotsDisabledValue, config.robotsEnabledValue)
377-
// single * is supported but ignored
375+
const robotRule = normaliseRobotsRouteRule(rules)
376+
// only if a rule has been specified as robots.txt will cover disallows
377+
if (robotRule && !robotRule.allow && robotRule.rule) {
378378
// @ts-expect-error untyped
379-
nuxt.options.routeRules[route] = defu({
380-
headers: {
381-
'X-Robots-Tag': robotRules.rule,
382-
},
383-
}, nuxt.options.routeRules?.[route])
379+
nuxt.options.routeRules[route] = defu({
380+
headers: {
381+
'X-Robots-Tag': robotRule.rule,
382+
},
383+
}, nuxt.options.routeRules?.[route])
384+
}
384385
})
385386
}
386387

@@ -389,9 +390,8 @@ export default defineNuxtModule<ModuleOptions>({
389390
// iterate the route rules and add any non indexable rules to disallow
390391
Object.entries(nuxt.options.routeRules || {}).forEach(([route, rules]) => {
391392
const url = route.split('/').map(segment => segment.startsWith(':') ? '*' : segment).join('/')
392-
const groupIndexable = indexableFromGroup(config.groups, url)
393-
const { indexable } = normaliseRobotsRouteRule(rules, groupIndexable, config.robotsDisabledValue, config.robotsEnabledValue)
394-
if (!indexable) {
393+
const robotsRule = normaliseRobotsRouteRule(rules)
394+
if (robotsRule && !robotsRule.allow) {
395395
// single * is supported but ignored
396396
extraDisallows.add(url.replaceAll('**', '*'))
397397
}
@@ -410,6 +410,8 @@ export default defineNuxtModule<ModuleOptions>({
410410
}
411411
}
412412

413+
config.groups = config.groups.map(normalizeGroup)
414+
413415
nuxt.options.runtimeConfig['nuxt-robots'] = {
414416
version: version || '',
415417
usingNuxtContent: hasNuxtModule('@nuxt/content'),
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import type { H3Event } from 'h3'
22
import { withoutTrailingSlash } from 'ufo'
3-
import { createNitroRouteRuleMatcher, withoutQuery } from '../kit'
4-
import { indexableFromGroup, normaliseRobotsRouteRule } from '../../util'
5-
import { useNitroApp, useRuntimeConfig } from '#imports'
3+
import { useNitroApp } from 'nitropack/runtime'
4+
import { getRequestHeader } from 'h3'
5+
import { createNitroRouteRuleMatcher } from '../kit'
6+
import { matchPathToRule, normaliseRobotsRouteRule } from '../../util'
7+
import { useRuntimeConfig } from '#imports'
68
import { getSiteRobotConfig } from '#internal/nuxt-robots'
79

8-
export function getPathRobotConfig(e: H3Event, options?: { skipSiteIndexable?: boolean, path?: string }) {
10+
export function getPathRobotConfig(e: H3Event, options?: { userAgent?: string, skipSiteIndexable?: boolean, path?: string }) {
911
// has already been resolved
1012
const { robotsDisabledValue, robotsEnabledValue, usingNuxtContent } = useRuntimeConfig()['nuxt-robots']
1113
if (!options?.skipSiteIndexable) {
@@ -16,14 +18,64 @@ export function getPathRobotConfig(e: H3Event, options?: { skipSiteIndexable?: b
1618
}
1719
}
1820
}
19-
const path = withoutQuery(options?.path || e.path)
21+
const path = options?.path || e.path
22+
const userAgent = options?.userAgent || getRequestHeader(e, 'User-Agent')
2023
const nitroApp = useNitroApp()
24+
// 1. robots txt no indexing
25+
const groups = [
26+
// run explicit user agent matching first
27+
...nitroApp._robots.ctx.groups.filter((g) => {
28+
if (userAgent) {
29+
return g.userAgent.some(ua => ua.toLowerCase().includes(userAgent.toLowerCase()))
30+
}
31+
return false
32+
}),
33+
// run wildcard matches second
34+
...nitroApp._robots.ctx.groups.filter(g => g.userAgent.includes('*')),
35+
]
36+
for (const group of groups) {
37+
const robotsTxtRule = matchPathToRule(path, group._rules)
38+
if (robotsTxtRule) {
39+
if (!robotsTxtRule.allow) {
40+
return {
41+
allow: false,
42+
rule: robotsDisabledValue,
43+
debug: {
44+
source: '/robots.txt',
45+
line: `Disallow: ${robotsTxtRule.pattern}`,
46+
},
47+
}
48+
}
49+
// exit loop continue to other checks (explicit robots allows)
50+
break
51+
}
52+
}
53+
54+
// 2. nuxt content rules
55+
if (usingNuxtContent && nitroApp._robots?.nuxtContentUrls?.has(withoutTrailingSlash(path))) {
56+
return {
57+
allow: false,
58+
rule: robotsDisabledValue,
59+
debug: {
60+
source: 'Nuxt Content',
61+
},
62+
}
63+
}
64+
65+
// 3. nitro route rules
2166
nitroApp._robotsRuleMactcher = nitroApp._robotsRuleMactcher || createNitroRouteRuleMatcher()
22-
const routeRules = nitroApp._robotsRuleMactcher(path)
23-
let defaultIndexable = indexableFromGroup(nitroApp._robots.ctx.groups, path)
24-
if (usingNuxtContent) {
25-
if (nitroApp._robots?.nuxtContentUrls?.has(withoutTrailingSlash(path)))
26-
defaultIndexable = false
67+
const routeRules = normaliseRobotsRouteRule(nitroApp._robotsRuleMactcher(path))
68+
if (routeRules) {
69+
return {
70+
allow: routeRules.allow,
71+
rule: routeRules.rule || (routeRules.allow ? robotsEnabledValue : robotsDisabledValue),
72+
debug: {
73+
source: 'Route Rules',
74+
},
75+
}
76+
}
77+
return {
78+
allow: true,
79+
rule: robotsEnabledValue,
2780
}
28-
return normaliseRobotsRouteRule(routeRules, defaultIndexable, robotsDisabledValue, robotsEnabledValue)
2981
}

‎src/runtime/nitro/server/__robots__/debug-path.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { getPathRobotConfig } from '#internal/nuxt-robots'
33

44
export default defineEventHandler(async (e) => {
55
const path = getQuery(e).path as string
6-
return await getPathRobotConfig(e, {
6+
return getPathRobotConfig(e, {
77
path,
88
})
99
})

‎src/runtime/types.ts

+3
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ export interface RobotsGroupResolved {
4646
cleanParam?: string[]
4747
// nuxt-simple-robots internals
4848
_skipI18n?: boolean
49+
// runtime optimization
50+
_indexable: boolean
51+
_rules: { pattern: string, allow: boolean }[]
4952
}
5053

5154
export interface HookRobotsTxtContext {

0 commit comments

Comments
 (0)
Please sign in to comment.