1
+ import type { UseHeadInput } from 'unhead'
2
+
3
/**
 * Matches a single HTML attribute inside an opening tag.
 *
 * The name is any run of characters that is not whitespace, a quote, `<`, `>`,
 * `/` or `=`, so hyphenated / namespaced names such as `http-equiv`, `data-x`
 * or `xml:lang` stay in one piece. The optional value may be double-quoted,
 * single-quoted (each may contain the other quote character) or unquoted.
 */
const Attrs = /([^\s"'<>\/=]+)(?:=(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?/g
/** Opening `<html ...>` tag. */
const HtmlTag = /<html[^>]*>/
/** Opening `<body ...>` tag. */
const BodyTag = /<body[^>]*>/
/** Captures everything between `<head ...>` and `</head>` (dot matches newlines). */
const HeadContent = /<head[^>]*>(.*?)<\/head>/s
/** Head tags that have no closing tag. */
const SelfClosingTags = /<(meta|link|base)[^>]*>/g
/** Head tags with a body, matched up to their own closing tag. */
const ClosingTags = /<(title|script|style)[^>]*>[\s\S]*?<\/\1>/g
// eslint-disable-next-line regexp/no-misleading-capturing-group
const NewLines = /(\n\s*)+/g

/**
 * Parses the attributes of the first `<...>` tag found in `tag`.
 *
 * @param tag An HTML fragment beginning with an opening tag, e.g. `<html lang="en">`
 *   or `<script async src="/a.js">...</script>`. May be an empty string.
 * @returns An object mapping attribute names to their string value, or to `true`
 *   for value-less (boolean) attributes. When at least one attribute is found the
 *   object also carries `tagPriority: 'low'`; otherwise `{}` is returned.
 */
function extractAttributes(tag: string) {
  // Text between the first `<` and `>`, minus the leading tag name. The name is
  // stripped with a whitespace-aware pattern so tags whose attributes start on
  // a new line or after a tab are handled as well as space-separated ones.
  const inner = tag.match(/<([^>]*)>/)?.[1]?.replace(/^\s*\S+\s*/, '')
  if (!inner)
    return {}
  const attrs = inner.match(Attrs)
  if (!attrs)
    return {}
  return attrs.reduce((acc, attr) => {
    // Only the first '=' separates name from value; later ones belong to the value.
    const sep = attr.indexOf('=')
    if (sep <= 0)
      return Object.assign(acc, { [attr]: true, tagPriority: 'low' })
    const raw = attr.slice(sep + 1)
    const first = raw.charAt(0)
    // Strip the surrounding quotes only when the value is actually quoted.
    const isQuoted = raw.length > 1 && (first === '"' || first === '\'') && raw.endsWith(first)
    return Object.assign(acc, {
      [attr.slice(0, sep)]: isQuoted ? raw.slice(1, -1) : raw,
      tagPriority: 'low',
    })
  }, {})
}
10
25
11
26
// extractTagsFromHtml(html): pulls head-related tags out of an HTML string and returns `{ html, input }`.
// NOTE(review): this span is a rendered diff - `+` and unmarked lines are the current code, `-` lines are
// the removed version, and the bare numbers are gutter line numbers.
// Behavior visible in the current (`+` / unmarked) lines:
//  - input.htmlAttrs / input.bodyAttrs come from extractAttributes() applied to the HtmlTag / BodyTag match
//    (or '' when there is none); the matched tag in `html` is then replaced by a bare '<html>' / '<body>'.
//  - innerHead is capture group 1 of the HeadContent match, or '' when there is no match.
//  - every SelfClosingTags match in innerHead is removed from `html`, and its attributes are pushed onto
//    input[tag], where `tag` is the text between '<' and the first space of the match.
//  - every ClosingTags match in innerHead is removed from `html`; `res` holds tagPriority 'low' plus the
//    tag's attributes, and a non-empty inner content is stored as innerHTML for 'script' and as
//    textContent for any other type.
//  - a 'title' result is assigned to input.title; a later visible line pushes `res` onto input[type].
//  - finally every NewLines match in `html` is replaced by a single '\n'.
export function extractTagsFromHtml ( html : string ) {
12
- const input = { }
13
- // i should be able to give it a string of html and it should convert it to input for useHead()
14
- // parse htmlAttrs, bodyAttrs
15
- input . htmlAttrs = extractAttributes ( html . match ( / < h t m l [ ^ > ] * > / ) ?. [ 0 ] || '' )
27
+ const input : UseHeadInput < any > = { }
28
+ input . htmlAttrs = extractAttributes ( html . match ( HtmlTag ) ?. [ 0 ] || '' )
29
+ html = html . replace ( HtmlTag , '<html>' )
30
+
31
+ input . bodyAttrs = extractAttributes ( html . match ( BodyTag ) ?. [ 0 ] || '' )
32
+ html = html . replace ( BodyTag , '<body>' )
16
33
17
- html = html . replace ( / < h t m l [ ^ > ] * > / , '<html>' )
18
- input . bodyAttrs = extractAttributes ( html . match ( / < b o d y [ ^ > ] * > / ) ?. [ 0 ] || '' )
19
- html = html . replace ( / < b o d y [ ^ > ] * > / , '<body>' )
20
- // parse headTags, need to split on /> and seperate each tag
21
- const innerHead = html . match ( / < h e a d [ ^ > ] * > ( [ \s \S ] * ) < \/ h e a d > / ) ?. [ 1 ]
22
- // replace ['meta', 'link', 'base'] tags first because they're unique in that they don't have a closing tag
23
- innerHead ?. match ( / < m e t a [ ^ > ] * > | < l i n k [ ^ > ] * > | < b a s e [ ^ > ] * > / g) . forEach ( ( s ) => {
34
+ const innerHead = html . match ( HeadContent ) ?. [ 1 ] || ''
35
+ innerHead . match ( SelfClosingTags ) ?. forEach ( ( s ) => {
24
36
html = html . replace ( s , '' )
25
- const tag = s . split ( ' ' ) [ 0 ] . slice ( 1 )
37
+ const tag = s . split ( ' ' ) [ 0 ] . slice ( 1 ) as 'meta'
26
38
input [ tag ] = input [ tag ] || [ ]
27
- input [ tag ] . push ( extractAttributes ( s ) )
39
+ input [ tag ] . push ( extractAttributes ( s ) as any )
28
40
} )
29
- innerHead ?. match ( / < t i t l e [ ^ > ] * > [ \s \S ] * ?< \/ t i t l e > | < s c r i p t [ ^ > ] * > [ \s \S ] * ?< \/ s c r i p t > | < s t y l e [ ^ > ] * > [ \s \S ] * ?< \/ s t y l e > / g)
30
- . map ( tag => tag . trim ( ) )
41
+
42
+ innerHead . match ( ClosingTags )
43
+ ?. map ( tag => tag . trim ( ) )
31
44
. filter ( Boolean )
32
45
. forEach ( ( tag ) => {
33
46
html = html . replace ( tag , '' )
34
- const type = tag . match ( / < ( [ a - z - ] + ) / ) ?. [ 1 ]
47
+ const type = tag . match ( / < ( [ a - z - ] + ) / ) ?. [ 1 ] as 'script' | 'title'
35
48
const res = {
36
49
tagPriority : 'low' ,
37
- [ type !== 'script' ? 'textContent' : 'innerHTML' ] : tag . match ( / > ( [ \s \S ] * ) < / ) ?. [ 1 ] ,
38
50
...extractAttributes ( tag ) ,
51
+ } as any
52
+ const innerContent = tag . match ( / > ( [ \s \S ] * ) < / ) ?. [ 1 ]
53
+ if ( innerContent ) {
54
+ res [ type !== 'script' ? 'textContent' : 'innerHTML' ] = innerContent
39
55
}
40
56
if ( type === 'title' ) {
41
57
input . title = res
// NOTE(review): the hunk header below means the lines between `input.title = res` and
// `input[type].push(res)` are not shown in this view - presumably the else branch; confirm against the full file.
@@ -45,8 +61,7 @@ export function extractTagsFromHtml(html: string) {
45
61
input [ type ] . push ( res )
46
62
}
47
63
} )
48
- // remove duplicate new lines from html, could be 2, 5 or 20 in a row
49
- html = html . replace ( / ( \n \s * ) + / g, '\n' )
50
- // we leave any body tags as the order is out of our control
64
+
65
+ html = html . replace ( NewLines , '\n' )
51
66
return { html, input }
52
67
}
0 commit comments