Skip to content

Commit

Permalink
Fix reading comments with UTF chars (fixes #238) (#240)
Browse files Browse the repository at this point in the history
* Fix reading comments with UTF chars (fixes #238)
* Fix printable methods to account for UTF chars
  • Loading branch information
gnodet committed Mar 1, 2023
1 parent d3d137c commit 32b72a6
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 8 deletions.
39 changes: 31 additions & 8 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Expand Up @@ -2981,8 +2981,8 @@ private void parseComment()
// implements XML 1.0 Section 2.5 Comments

// ASSUMPTION: seen <!-
char ch = more();
if ( ch != '-' )
char cch = more();
if ( cch != '-' )
throw new XmlPullParserException( "expected <!-- for comment start", this, null );
if ( tokenize )
posStart = pos;
Expand All @@ -2999,7 +2999,19 @@ private void parseComment()
while ( true )
{
// scan until it hits -->
ch = more();
cch = more();
int ch;
char cch2;
if ( Character.isHighSurrogate( cch ) )
{
cch2 = more();
ch = Character.toCodePoint( cch, cch2 );
}
else
{
cch2 = 0;
ch = cch;
}
if ( seenDashDash && ch != '>' )
{
throw new XmlPullParserException( "in comment after two dashes (--) next character must be >"
Expand Down Expand Up @@ -3074,7 +3086,11 @@ else if ( ch == '\n' )
{
if ( pcEnd >= pc.length )
ensurePC( pcEnd );
pc[pcEnd++] = ch;
pc[pcEnd++] = cch;
if ( cch2 != 0 )
{
pc[pcEnd++] = cch2;
}
}
normalizedCR = false;
}
Expand Down Expand Up @@ -4153,7 +4169,7 @@ private static boolean isS( char ch )
// ch != '\u0000' ch < '\uFFFE'

// private char printable(char ch) { return ch; }
private static String printable( char ch )
private static String printable( int ch )
{
if ( ch == '\n' )
{
Expand All @@ -4175,18 +4191,25 @@ else if ( ch == '\'' )
{
return "\\u" + Integer.toHexString( ch );
}
return "" + ch;
if ( Character.isBmpCodePoint( ch ) )
{
return Character.toString( ( char ) ch );
}
else
{
return new String( new char[] { Character.highSurrogate( ch ), Character.lowSurrogate( ch ) } );
}
}

private static String printable( String s )
{
if ( s == null )
return null;
final int sLen = s.length();
final int sLen = s.codePointCount(0, s.length());
StringBuilder buf = new StringBuilder( sLen + 10 );
for ( int i = 0; i < sLen; ++i )
{
buf.append( printable( s.charAt( i ) ) );
buf.append( printable( s.codePointAt( i ) ) );
}
s = buf.toString();
return s;
Expand Down
26 changes: 26 additions & 0 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Expand Up @@ -1511,4 +1511,30 @@ public void testReplacementInPCArrayWithShorterCharArray()
fail( "should not raise exception: " + e );
}
}

/**
 * Checks that a comment containing a supplementary character (an emoji written as a
 * surrogate pair) is tokenized without error and that its text comes back unchanged.
 *
 * @throws IOException if reading the input fails
 */
@Test
public void testUnicode() throws IOException {
    final String xml = "<project><!--ALL TEH BOMS! \uD83D\uDCA3 --></project>";

    try
    {
        final MXParser mxParser = new MXParser();
        mxParser.setInput( new StringReader( xml ) );

        // opening <project> tag
        final int openToken = mxParser.nextToken();
        assertEquals( XmlPullParser.START_TAG, openToken );
        assertEquals( "project", mxParser.getName() );

        // the comment, with both halves of the surrogate pair intact
        final int commentToken = mxParser.nextToken();
        assertEquals( XmlPullParser.COMMENT, commentToken );
        assertEquals( "ALL TEH BOMS! \uD83D\uDCA3 ", mxParser.getText() );

        // closing </project> tag
        final int closeToken = mxParser.nextToken();
        assertEquals( XmlPullParser.END_TAG, closeToken );
        assertEquals( "project", mxParser.getName() );
    }
    catch ( XmlPullParserException parseFailure )
    {
        parseFailure.printStackTrace();
        fail( "should not raise exception: " + parseFailure );
    }
}
}

0 comments on commit 32b72a6

Please sign in to comment.