Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reading comments with UTF chars (fixes #238) #240

Merged
merged 2 commits into from Mar 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
39 changes: 31 additions & 8 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Expand Up @@ -2981,8 +2981,8 @@ private void parseComment()
// implements XML 1.0 Section 2.5 Comments

// ASSUMPTION: seen <!-
char ch = more();
if ( ch != '-' )
char cch = more();
if ( cch != '-' )
throw new XmlPullParserException( "expected <!-- for comment start", this, null );
if ( tokenize )
posStart = pos;
Expand All @@ -2999,7 +2999,19 @@ private void parseComment()
while ( true )
{
// scan until it hits -->
ch = more();
cch = more();
int ch;
char cch2;
if ( Character.isHighSurrogate( cch ) )
{
cch2 = more();
ch = Character.toCodePoint( cch, cch2 );
}
else
{
cch2 = 0;
ch = cch;
}
if ( seenDashDash && ch != '>' )
{
throw new XmlPullParserException( "in comment after two dashes (--) next character must be >"
Expand Down Expand Up @@ -3074,7 +3086,11 @@ else if ( ch == '\n' )
{
if ( pcEnd >= pc.length )
ensurePC( pcEnd );
pc[pcEnd++] = ch;
pc[pcEnd++] = cch;
if ( cch2 != 0 )
{
pc[pcEnd++] = cch2;
}
}
normalizedCR = false;
}
Expand Down Expand Up @@ -4153,7 +4169,7 @@ private static boolean isS( char ch )
// ch != '\u0000' ch < '\uFFFE'

// private char printable(char ch) { return ch; }
private static String printable( char ch )
private static String printable( int ch )
{
if ( ch == '\n' )
{
Expand All @@ -4175,18 +4191,25 @@ else if ( ch == '\'' )
{
return "\\u" + Integer.toHexString( ch );
}
return "" + ch;
if ( Character.isBmpCodePoint( ch ) )
{
return Character.toString( ( char ) ch );
}
else
{
return new String( new char[] { Character.highSurrogate( ch ), Character.lowSurrogate( ch ) } );
}
}

/**
 * Returns a printable rendering of {@code s}, built by passing each Unicode code point of the
 * string to the {@code printable( int )} overload and concatenating the results.
 *
 * @param s the text to render; may be {@code null}
 * @return the rendered text, or {@code null} when {@code s} is {@code null}
 */
private static String printable( String s )
{
    if ( s == null )
        return null;
    final int sLen = s.length();
    StringBuilder buf = new StringBuilder( sLen + 10 );
    // String.codePointAt takes a char index, not a code point ordinal. Walk the string by char
    // index and step over both halves of a surrogate pair at once; counting code points and
    // using that counter as the index would re-read low surrogates and stop before the end.
    for ( int i = 0; i < sLen; )
    {
        final int codePoint = s.codePointAt( i );
        buf.append( printable( codePoint ) );
        i += Character.charCount( codePoint );
    }
    s = buf.toString();
    return s;
Expand Down
26 changes: 26 additions & 0 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Expand Up @@ -1511,4 +1511,30 @@ public void testReplacementInPCArrayWithShorterCharArray()
fail( "should not raise exception: " + e );
}
}

/**
 * Checks that a comment containing a supplementary character (U+1F4A3, written in the input as
 * a UTF-16 surrogate pair) is tokenized as a single COMMENT whose text is returned unchanged,
 * and that the enclosing element's start and end tags are still reported around it.
 *
 * @throws IOException declared for the parser calls made by this test
 */
@Test
public void testUnicode()
    throws IOException
{
    final String xml = "<project><!--ALL TEH BOMS! \uD83D\uDCA3 --></project>";
    final String expectedComment = "ALL TEH BOMS! \uD83D\uDCA3 ";
    final String expectedElement = "project";

    try
    {
        MXParser mxParser = new MXParser();
        mxParser.setInput( new StringReader( xml ) );

        // opening tag
        assertEquals( XmlPullParser.START_TAG, mxParser.nextToken() );
        assertEquals( expectedElement, mxParser.getName() );

        // the comment, including the surrogate pair and the trailing space
        assertEquals( XmlPullParser.COMMENT, mxParser.nextToken() );
        assertEquals( expectedComment, mxParser.getText() );

        // closing tag
        assertEquals( XmlPullParser.END_TAG, mxParser.nextToken() );
        assertEquals( expectedElement, mxParser.getName() );
    }
    catch ( XmlPullParserException parseFailure )
    {
        parseFailure.printStackTrace();
        fail( "should not raise exception: " + parseFailure );
    }
}
}