Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix mappings between LCIDs and code pages. #169

Merged
merged 4 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 6 additions & 6 deletions internal/cp/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,31 +60,31 @@ func collation2charset(col Collation) *charsetMap {
switch col.getLcid() {
case 0x001e, 0x041e:
return cp874
case 0x0411, 0x10411:
case 0x0411, 0x10411, 0x40411:
return cp932
case 0x0804, 0x1004, 0x20804:
return cp936
case 0x0012, 0x0412:
return cp949
case 0x0404, 0x1404, 0x0c04, 0x7c04, 0x30404:
case 0x0404, 0x1404, 0x0c04, 0x7c04, 0x30404, 0x21404:
return cp950
case 0x041c, 0x041a, 0x0405, 0x040e, 0x104e, 0x0415, 0x0418, 0x041b, 0x0424, 0x1040e:
case 0x041c, 0x041a, 0x0405, 0x040e, 0x104e, 0x0415, 0x0418, 0x041b, 0x0424, 0x1040e, 0x0442, 0x081A, 0x141A:
return cp1250
case 0x0423, 0x0402, 0x042f, 0x0419, 0x081a, 0x0c1a, 0x0422, 0x043f, 0x0444, 0x082c:
case 0x0423, 0x0402, 0x042f, 0x0419, 0x0c1a, 0x0422, 0x043f, 0x0444, 0x082c, 0x046D, 0x0485, 0x201A:
return cp1251
case 0x0408:
return cp1253
case 0x041f, 0x042c, 0x0443:
return cp1254
case 0x040d:
return cp1255
case 0x0401, 0x0801, 0xc01, 0x1001, 0x1401, 0x1801, 0x1c01, 0x2001, 0x2401, 0x2801, 0x2c01, 0x3001, 0x3401, 0x3801, 0x3c01, 0x4001, 0x0429, 0x0420:
case 0x0401, 0x0801, 0xc01, 0x1001, 0x1401, 0x1801, 0x1c01, 0x2001, 0x2401, 0x2801, 0x2c01, 0x3001, 0x3401, 0x3801, 0x3c01, 0x4001, 0x0429, 0x0420, 0x0480, 0x048C:
return cp1256
case 0x0425, 0x0426, 0x0427, 0x0827:
return cp1257
case 0x042a:
return cp1258
case 0x0439, 0x045a, 0x0465:
case 0x0439, 0x045a, 0x0465, 0x043A, 0x0445, 0x044D, 0x0451, 0x0453, 0x0454, 0x0461, 0x0463, 0x0481:
return nil
}
return cp1252
Expand Down
345 changes: 345 additions & 0 deletions lcids_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,345 @@
package mssql

import (
"bytes"
"database/sql"
"encoding/binary"
"testing"
)

// comparisonQueriesGeneratorSQL is the SQL that generates other SQL
// statements for running comparisons.
// It does not create any persistent database objects,
// employing CTEs instead. The structure of CTEs is pretty
// self-explanatory, with one CTE per code page containing
// all available LCIDs. The exception is code page 1252, which
// contains all LCIDs not included in any of the other CTEs.
//
// The query also carries test integrity assertions which ensure that:
// 1. All LCIDs in the system (as returned by sys.fn_helpcollations())
// are covered by the test.
// 2. The 1252 CTE does not return any "leaked" code pages that are not 1252.
// 3. All code pages come with sample reference data for the fetch comparison.
// None of the SQL syntax uses any SQL Server version-specific syntax,
// and should work on any version without change.
const comparisonQueriesGeneratorSQL = `
--
-- Generates SELECT statements for fetching data encoded with various code pages
-- along with the same data encoded in UTF-16.
-- The client can execute the generated queries and compare "codepage_data" column
-- with the original data returned in the "reference_data" column.
--
-- Single-byte code pages contain the entire range of 128-255 byte values as sample data.
-- Double-byte code pages contain a representative sample text since it is not practical
-- to provide the entire range of possible characters.
--
-- The format of the query is
-- SELECT
-- N'<sample data>' AS [reference_data]
-- , CAST(N'<sample data> COLLATE <collation with the given code page and lcid> AS VARCHAR(1000)) AS [codepage_data]
-- This way the conversion from Nvarchar to Varchar does not depend on the collation
-- of the currently active database.
--
-- The dirver's job is to fetch both columns and compare them. If the codepage/LCID mapping is done wrong,
-- the comparison will fail.
--
with
cte_data (cp, datasample) as (
-- Thai
select 874, N'€…‘’“”•–—กขฃคฅฆงจฉชซฌฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛'
union all
-- Japanese (double-byte encoding, so provide a sample rather than the entire range)
select 932, N'産業通商資源部の安徳根(アン・ドクグン)長官は「今後もモバイル・ワールド・コングレス(MWC)など海外の見本市で統合韓国館を拡大し、参加企業の成果を高める」との方針を示した。'
union all
-- Chinese Simplified (double-byte encoding, so provide a sample rather than the entire range)
select 936, N'乘坐“蓝梦之星”号邮轮访问济州的中国团体游客造访新罗免税店济州分店。'
union all
-- Chinese Traditional (double-byte encoding, so provide a sample rather than the entire range)
select 950, N'首相弗雷澤里克森在首相府和國會所在地克里斯蒂安堡宮的露台上,向民眾公布王儲正式登基成為國王。'
union all
-- Korean (double-byte encoding, so provide a sample rather than the entire range)
select 949, N'홍성은 마늘과 한돈, 김 등 산지로 유명하지만, 그동안 상대적으로 시설원예 분야에서는 취약하다는 평가를 받았다.'
union all
-- Central European (Czech, Slovak, Polish, Hungarian, etc.)
select 1250, N'€‚„…†‡‰Š‹ŚŤŽŹ‘’“”•–—™š›śťžźˇ˘Ł¤Ą¦§¨©Ş«¬®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙'
union all
-- Cyrillic
select 1251, N'ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—™љ›њќћџЎўЈ¤Ґ¦§Ё©Є«¬®Ї°±Ііґµ¶·ё№є»јЅѕїАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя'
union all
-- Generic Latin
select 1252, N'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
union all
-- Greek
select 1253, N'€‚ƒ„…†‡‰‹‘’“”•–—™›΅Ά£¤¥¦§¨©«¬®―°±²³΄µ¶·ΈΉΊ»Ό½ΎΏCxΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ'
union all
-- Turkish
select 1254, N'€‚ƒ„…†‡ˆ‰Š‹Œ‘’“”•–—˜™š›œŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ'
union all
-- Hebrew
select 1255, N'€‚ƒ„…†‡ˆ‰‹‘’“”•–—˜™›¡¢£₪¥¦§¨©×«¬®¯°±²³´µ¶·¸¹÷»¼½¾¿ְֱֲֳִֵֶַָֻּֽ־ֿ׀ׁׂ׃װױײ׳״אבגדהוזחטיךכלםמןנסעףפץצקרשת'
union all
-- Arabic
select 1256, N'€پ‚ƒ„…†‡ˆ‰ٹ‹Œچژڈگ‘’“”•–—ک™ڑ›œں،¢£¤¥¦§¨©ھ«¬®¯°±²³´µ¶·¸¹؛»¼½¾؟ہءآأؤإئابةتثجحخدذرزسشصض×طظعغـفقكàلâمنهوçèéêëىيîô÷ùûüے'
union all
-- Baltic countries (Estonia, Latvia, Lithuania)
select 1257, N'€‚„…†‡‰‹¨ˇ¸‘’“”•–—™›¯˛¢£¤¦§Ø©Ŗ«¬®Æ°±²³´µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž˙'
union all
-- Vietnamese
select 1258, N'€‚ƒ„…†‡ˆ‰‹Œ‘’“”•–—˜™›œŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂĂÄÅÆÇÈÉÊËÍÎÏĐÑÓÔƠÖ×ØÙÚÛÜƯßàáâăäåæçèéêëíîïđñóôơö÷øùúûüư₫ÿ'
),
cte_cp874 (cp, lcid, collation) as (
select 874, 0x0000041e, N'Thai_100_BIN2'
),
cte_cp932 (cp, lcid, collation) as (
select 932, 0x00000411, N'Japanese_XJIS_100_BIN2'
union all select 932, 0x00010411, N'Japanese_Unicode_BIN2'
union all select 932, 0x00040411, N'Japanese_Bushu_Kakusu_100_BIN2'
),
cte_cp936 (cp, lcid, collation) as (
select 936, 0x00000804, N'Chinese_Simplified_Pinyin_100_BIN2'
union all select 936, 0x00020804, N'Chinese_Simplified_Stroke_Order_100_BIN2'
),
cte_cp949 (cp, lcid, collation) as (
select 949, 0x00000412, N'Korean_100_BIN2'
),
cte_cp950 (cp, lcid, collation) as (
select 950, 0x00000404, N'Chinese_Traditional_Stroke_Count_100_BIN2'
union all select 950, 0x00001404, N'Chinese_Traditional_Pinyin_100_BIN2'
union all select 950, 0x00000c04, N'Chinese_Hong_Kong_Stroke_90_BIN2'
union all select 950, 0x00030404, N'Chinese_Traditional_Bopomofo_100_BIN2'
union all select 950, 0x00021404, N'Chinese_Traditional_Stroke_Order_100_BIN2'
),
cte_cp1250 (cp, lcid, collation) as (
select 1250, 0x0000041c, N'Albanian_100_BIN2'
union all select 1250, 0x0000041a, N'Croatian_100_BIN2'
union all select 1250, 0x00000405, N'Czech_100_BIN2'
union all select 1250, 0x0000040e, N'Hungarian_100_BIN2'
union all select 1250, 0x00000415, N'Polish_100_BIN2'
union all select 1250, 0x00000418, N'Romanian_100_BIN2'
union all select 1250, 0x0000041b, N'Slovak_100_BIN2'
union all select 1250, 0x00000424, N'Slovenian_100_BIN2'
union all select 1250, 0x0001040e, N'Hungarian_Technical_100_BIN2'
union all select 1250, 0x00000442, N'Turkmen_100_BIN2'
union all select 1250, 0x0000081A, N'Serbian_Latin_100_BIN2'
union all select 1250, 0x0000141A, N'Bosnian_Latin_100_BIN2'
),
cte_cp1251 (cp, lcid, collation) as (
select 1251, 0x0000042f, N'Macedonian_FYROM_100_BIN2'
union all select 1251, 0x00000419, N'Cyrillic_General_100_BIN2'
union all select 1251, 0x00000c1a, N'Serbian_Cyrillic_100_BIN2'
union all select 1251, 0x00000422, N'Ukrainian_100_BIN2'
union all select 1251, 0x0000043f, N'Kazakh_100_BIN2'
union all select 1251, 0x00000444, N'Tatar_100_BIN2'
union all select 1251, 0x0000082c, N'Azeri_Cyrillic_100_BIN2'
union all select 1251, 0x0000046D, N'Bashkir_100_BIN2'
union all select 1251, 0x00000485, N'Yakut_100_BIN2'
union all select 1251, 0x0000201A, N'Bosnian_Cyrillic_100_BIN2'
),
cte_cp1253 (cp, lcid, collation) as (
select 1253, 0x00000408, N'Greek_100_BIN2'
),
cte_cp1254 (cp, lcid, collation) as (
select 1254, 0x0000041f, N'Turkish_100_BIN2'
union all select 1254, 0x0000042c, N'Azeri_Latin_100_BIN2'
union all select 1254, 0x00000443, N'Uzbek_Latin_100_BIN2'
),
cte_cp1255 (cp, lcid, collation) as (
select 1255, 0x0000040d, N'Hebrew_100_BIN2'
),
cte_cp1256 (cp, lcid, collation) as (
select 1256, 0x00000401, N'Arabic_100_BIN2'
union all select 1256, 0x00000429, N'Persian_100_BIN2'
union all select 1256, 0x00000420, N'Urdu_100_BIN2'
union all select 1256, 0x00000480, N'Uighur_100_BIN2'
union all select 1256, 0x0000048C, N'Dari_100_BIN2'
),
cte_cp1257 (cp, lcid, collation) as (
select 1257, 0x00000425, N'Estonian_100_BIN2'
union all select 1257, 0x00000426, N'Latvian_100_BIN2'
union all select 1257, 0x00000427, N'Lithuanian_100_BIN2'
),
cte_cp1258 (cp, lcid, collation) as (
select 1258, 0x0000042a, N'Vietnamese_100_BIN2'
),
cte_cp_non1252 (cp, lcid, collation) as (
select cp, lcid, collation from cte_cp874
union all select cp, lcid, collation from cte_cp932
union all select cp, lcid, collation from cte_cp936
union all select cp, lcid, collation from cte_cp949
union all select cp, lcid, collation from cte_cp950
union all select cp, lcid, collation from cte_cp1250
union all select cp, lcid, collation from cte_cp1251
union all select cp, lcid, collation from cte_cp1253
union all select cp, lcid, collation from cte_cp1254
union all select cp, lcid, collation from cte_cp1255
union all select cp, lcid, collation from cte_cp1256
union all select cp, lcid, collation from cte_cp1257
union all select cp, lcid, collation from cte_cp1258
),
cte_cp1252 (cp, lcid, collation) as (
select distinct
-- TEST INTEGRITY ASSERTION:
-- If there's any code page that is not 1252 - cause a failure.
-- Use "divide by zero" as a distinctive error for this failure point.
iif(collationproperty(name, 'codepage') = 1252, 1252, 1/0)
, cast(collationproperty(name, 'lcid') as binary(4))
-- Doesn't matter which collation to pick as long as it's the only one for the given codepage/lcid combo.
, max (name) over (partition by collationproperty(name, 'codepage'), collationproperty(name, 'lcid'))
from fn_helpcollations() hc
where
collationproperty(name, 'codepage') not in (0, 65001)
and collationproperty(name, 'sortid') = 0
and not exists (
select * from cte_cp_non1252
where
cp = collationproperty(hc.name, 'codepage')
and lcid = cast(collationproperty(hc.name, 'lcid') as binary(4)))
),
cte_cp_all (cp, lcid, collation) as (
select cp, lcid, collation from cte_cp_non1252
union all select cp, lcid, collation from cte_cp1252
),
cte_sqltext (cp, lcid, collation, sqltext) as (
select
cte_cp_all.cp
, cte_cp_all.lcid
, cte_cp_all.collation
, N'select' + char(13) + char(10) +
N' N''' + cte_data.datasample + N''' as reference_data' + nchar(13) + nchar(10) +
N', cast(N''' + cte_data.datasample + N''' collate ' + cte_cp_all.collation + N' as varchar(1000)) as codepage_data'
from
cte_cp_all left join cte_data on (cte_cp_all.cp = cte_data.cp)
)
select cp, lcid, collation, sqltext from cte_sqltext
union all
select
-- TEST INTEGRITY ASSERTION:
-- Check if all codepages have a sample data associated with them.
-- The "sqltext" property will be set to NULL for those that do not.
-- Cause a failure if found.
-- Use "arithmetic overflow" as a distinctive error for this failure point
-- by casting the code page value (that is greater than 255) to tinyint.
cast(cp as tinyint), null, null, null
from cte_sqltext
where sqltext is null
union all
select
-- TEST INTEGRITY ASSERTION:
-- Check for "orphan" codepage/lcid combos that are present in SQL server
-- but not covered by the above CTEs. Exclude:
-- - SQL collations (the ones whose SortId is nonzero)
-- - Unicode-only collations (the ones that do not have a code page)
-- - UTF-8 collations - those are orthogonal to LCIDs and don't need to be tested.
-- If found such orphan combos - cause a failure.
-- Use "invalid cast" as a distinctive error for this failure point by
-- casting collation name (that is guaranteed to not be a valid number) to a number.
cast(name as int), null, null, null
from fn_helpcollations() hc
where
collationproperty(name, 'codepage') not in (0, 65001)
and collationproperty(name, 'sortid') = 0
and not exists (
select * from cte_cp_all where
cast(collationproperty(hc.name, 'lcid') as binary(4)) = cte_cp_all.lcid
and collationproperty(hc.name, 'codepage') = cte_cp_all.cp
)
`

// Helper types used to organize the generated comparison queries.
type (
	// CpLcid is a code page / LCID pair; it serves as the key of
	// CpLcidComparisonMap.
	CpLcid struct {
		cp   int
		lcid int32
	}

	// CpLcidComparisonData carries the collation name and the text of the
	// comparison query associated with one code page / LCID pair.
	CpLcidComparisonData struct {
		collation string
		sqltext   string
	}

	// CpLcidComparisonMap is a map type that associates every
	// code page / LCID pair with its collation and comparison query.
	CpLcidComparisonMap map[CpLcid]CpLcidComparisonData
)

// Builds a map of LCID fetching queries for all codepage/LCID pairs.
func buildLcidFetchComparisonMap(conn *sql.DB, t *testing.T) CpLcidComparisonMap {
stmt, err := conn.Prepare(comparisonQueriesGeneratorSQL)
if err != nil {
t.Error("Unable to run comparison queries generator query", err.Error())
}
defer stmt.Close()

rows, err := stmt.Query()
sergeytenmsft marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
t.Error("Query failed:", err.Error())
}
defer rows.Close()

result := make(CpLcidComparisonMap)

for rows.Next() {
var codepage int
var lcidRaw []byte = make([]byte, 4)
var collation string
var sqltext string

err := rows.Scan(&codepage, &lcidRaw, &collation, &sqltext)
if err != nil {
t.Error("Failed to fetch the comparison SQL text row:", err.Error())
}

var lcid int32
err = binary.Read(bytes.NewReader(lcidRaw), binary.BigEndian, &lcid)
shueybubbles marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
t.Error("Failed to convert LCID from binary to int:", err.Error())
}

cplcid := CpLcid{codepage, lcid}
cplciddata := CpLcidComparisonData{collation, sqltext}

result[cplcid] = cplciddata
}

err = rows.Err()
if err != nil {
t.Error("Rows containing comparison queries have errors", err)
}

return result
}

// verifyLcidFetch executes one generated comparison query and reports whether
// its two columns are equal.
//
// The query pointed to by sqltext is scanned as a single row with two string
// columns: the reference data first, the code page data second. The function
// returns true only when the row was fetched successfully and both strings
// match.
//
// A query or scan failure is reported through t.Error and yields false.
// Without the explicit false, a failed fetch would usually leave both strings
// empty and therefore compare as equal, hiding which LCID caused the failure
// from the caller.
func verifyLcidFetch(conn *sql.DB, sqltext *string, t *testing.T) bool {
	t.Helper()

	var refdata, cpdata string
	if err := conn.QueryRow(*sqltext).Scan(&refdata, &cpdata); err != nil {
		t.Error("Cannot scan reference and codepage data", err)
		return false
	}

	return refdata == cpdata
}

// TestLcidsFetching fetches data for every code page / LCID pair produced by
// buildLcidFetchComparisonMap and checks, via verifyLcidFetch, that the
// code page encoded column matches its reference column. A mismatch means the
// LCID is mapped to the wrong Windows code page.
//
// Every failing pair is logged, and the test is marked as failed once all
// pairs have been checked, so a single run lists all wrong mappings.
func TestLcidsFetching(t *testing.T) {
	conn, err := sql.Open("sqlserver", makeConnStr(t).String())
	if err != nil {
		// Without a usable handle nothing below can run, and the deferred
		// Close would be invoked on a nil *sql.DB.
		t.Fatalf("Unable to open connection: %v", err)
	}
	defer conn.Close()

	cplcidmap := buildLcidFetchComparisonMap(conn, t)
	if len(cplcidmap) == 0 {
		// An empty map would make the loop below pass vacuously.
		t.Fatal("No comparison queries were generated")
	}

	success := true
	for cplcid, cplciddata := range cplcidmap {
		if !verifyLcidFetch(conn, &cplciddata.sqltext, t) {
			success = false
			t.Logf("LCID fetch failed for codepage %d, lcid 0x%x, collation %s",
				cplcid.cp, cplcid.lcid, cplciddata.collation)
		}
	}

	if !success {
		t.Error("There are failed LCID fetches. See test log for the details.")
	}
}