Link Search Menu Expand Document

Collation Examples

Contents

  1. Simple Collation Sample Customization
    1. Main Function
  2. Language-sensitive searching
  3. Using large buffers to manage sort keys

Simple Collation Sample Customization

The following program demonstrates how to compare and create sort keys with default locale.

In C:

#include <stdio.h>
#include <memory.h>
#include <string.h>
#include "unicode/ustring.h"
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ucol.h"

#define MAXBUFFERSIZE 100
#define BIGBUFFERSIZE 5000

UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
{
    UChar         dispName    [MAXBUFFERSIZE]; 
    int32_t       bufferLen   = 0;
    UChar         source            [MAXBUFFERSIZE];
    UChar         target            [MAXBUFFERSIZE];
    UCollationResult result   = UCOL_EQUAL;
    uint8_t             sourceKeyArray    [MAXBUFFERSIZE];
    uint8_t             targetKeyArray    [MAXBUFFERSIZE]; 
    int32_t       sourceKeyOut      = 0, 
                targetKeyOut = 0;
    UCollator     *myCollator = 0;
    if (U_FAILURE(*status))
    {
        return FALSE;
    }
    u_uastrcpy(source, "This is a test.");
    u_uastrcpy(target, "THIS IS A TEST.");
    myCollator = ucol_open(locale, status);
    if (U_FAILURE(*status)){
        bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status);
        /*Report the error with display name... */
        fprintf(stderr,
        "Failed to create the collator for : \"%s\"\n", dispName);
        return FALSE;
    }
    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
    /* result is 1, secondary differences only for ignorable space characters*/
    if (result != UCOL_LESS)
    {
        fprintf(stderr,
        "Comparing two strings with only secondary differences in C failed.\n");
        return FALSE;
    }
    /* To compare them with just primary differences */
    ucol_setStrength(myCollator, UCOL_PRIMARY);
    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
    /* result is 0 */
    if (result != 0)
    {
        fprintf(stderr,
        "Comparing two strings with no differences in C failed.\n");
        return FALSE;
    }

    /* Now, do the same comparison with keys */
    sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);
    targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);
    result = 0;
    result = strcmp(sourceKeyArray, targetKeyArray);
    if (result != 0)
    {
        fprintf(stderr,
        "Comparing two strings with sort keys in C failed.\n");
        return FALSE;
    }
    ucol_close(myCollator);
    return TRUE;
}

In C++:

#include <stdio.h>
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"
UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
{
    UnicodeString dispName; 
    UnicodeString source("This is a test.");
    UnicodeString target("THIS IS A TEST.");
    Collator::EComparisonResult result    = Collator::EQUAL;
    CollationKey sourceKey;
    CollationKey targetKey; 
    Collator      *myCollator = 0;
    if (U_FAILURE(status))
    {
        return FALSE;
    }
    myCollator = Collator::createInstance(locale, status);
    if (U_FAILURE(status)){
        locale.getDisplayName(dispName);
        /*Report the error with display name... */
        fprintf(stderr,
        "%s: Failed to create the collator for : \"%s\"\n", dispName);
        return FALSE;
    }
    result = myCollator->compare(source, target);
    /* result is 1, secondary differences only for ignorable space characters*/
    if (result != UCOL_LESS)
    {
        fprintf(stderr,
        "Comparing two strings with only secondary differences in C failed.\n");
        return FALSE;
    }
    /* To compare them with just primary differences */
    myCollator->setStrength(Collator::PRIMARY);
    result = myCollator->compare(source, target);
    /* result is 0 */
    if (result != 0)
    {
        fprintf(stderr,
        "Comparing two strings with no differences in C failed.\n");
        return FALSE;
    }
    /* Now, do the same comparison with keys */
    myCollator->getCollationKey(source, sourceKey, status);
    myCollator->getCollationKey(target, targetKey, status);
    result = Collator::EQUAL;

    result = sourceKey.compareTo(targetKey);
    if (result != 0)
    {
        fprintf(stderr,
        "%s: Comparing two strings with sort keys in C failed.\n");
        return FALSE;
    }
    delete myCollator;
    return TRUE;
}

Main Function

extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
int main()
{
   UErrorCode status = U_ZERO_ERROR;
   fprintf(stdout, "\n");
   if (collateWithLocaleInCPP(Locale("en", "US"), status) != TRUE)
   {
        fprintf(stderr,
        "Collate with locale in C++ failed.\n");
   } else 
   {
       fprintf(stdout, "Collate with Locale C++ example worked!!\n");
   }
   status = U_ZERO_ERROR;
   fprintf(stdout, "\n");
   if (collateWithLocaleInC("en_US", &status) != TRUE)
   {
        fprintf(stderr,
        "%s: Collate with locale in C failed.\n");
   } else 
   {
       fprintf(stdout, "Collate with Locale C example worked!!\n");
   }
   return 0;
}

In Java:

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.CollationKey;
import java.util.Locale;

public class CollateExample
{

    public static void main(String arg[]) 
    {
        CollateExample example = new CollateExample();
        try {
            if (!example.collateWithLocale(Locale.US)) {
                System.err.println("Collate with locale example failed.");
            } 
            else {
                System.out.println("Collate with Locale example worked!!");
            }
        } catch (Exception e) {
            System.err.println("Collating with locale failed");
            e.printStackTrace();
        }
    }

    public boolean collateWithLocale(Locale locale) throws Exception
    {
        String source = "This is a test.";
        String target = "THIS IS A TEST.";
        Collator myCollator = Collator.getInstance(locale);

        int result = myCollator.compare(source, target);
        // result is 1, secondary differences only for ignorable space characters
        if (result >= 0) {
            System.err.println(
                "Comparing two strings with only secondary differences failed.");
            return false;
        }
        // To compare them with just primary differences
        myCollator.setStrength(Collator.PRIMARY);
        result = myCollator.compare(source, target);
        // result is 0
        if (result != 0) {
            System.err.println(
                           "Comparing two strings with no differences failed.");
            return false;
        }
        // Now, do the same comparison with keys
        CollationKey sourceKey = myCollator.getCollationKey(source);
        CollationKey targetKey = myCollator.getCollationKey(target);
        result = sourceKey.compareTo(targetKey);
        if (result != 0) {
            System.err.println("Comparing two strings with sort keys failed.");
            return false;
        }
        return true;
    }   
}

Language-sensitive searching

String searching is a well-researched area, and there are algorithms that can optimize the searching process. Perhaps the best is the Boyer-Moore method. For a full description of this concept, please see Laura Werner’s text searching article for more details (http://icu-project.org/docs/papers/efficient_text_searching_in_java.html).

However, implementing collation-based search with the Boyer-Moore method while getting correct results is very tricky, and ICU no longer uses this method.

Please see the String Search Service chapter.

Using large buffers to manage sort keys

A good solution for the problem of not knowing the sort key size in advance is to allocate a large buffer and store all the sort keys there, while keeping a list of indexes or pointers to that buffer.

Following is sample code that will take a pointer to an array of UChar pointer, an array of key indexes. It will allocate and fill a buffer with sort keys and return the maximum size for a sort key. Once you have done this to your string, you just need to allocate a field of maximum size and copy your sortkeys from the buffer to fields.

uint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
                            uint32_t sourceSize, uint8_t **buffer,
                            uint32_t *maxSize, UErrorCode *status) 
{
  if(status == NULL || U_FAILURE(*status)) {
    return 0;
  }

  uint32_t bufferSize = 16384;
  uint32_t increment = 16384;
  uint32_t currentOffset = 0;
  uint32_t keySize = 0;
  uint32_t i = 0;
  *maxSize = 0;

  *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
  if(buffer == NULL) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return 0;
  }

  for(i = 0; i < sourceSize; i++) {
    keys[i] = currentOffset;
    keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
    if(keySize > bufferSize-currentOffset) {
      *buffer = (uint8_t *)realloc(*buffer, bufferSize+increment);
      if(buffer == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
      }
      bufferSize += increment;
      keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
    }
    /* here you can hook code that does something interesting with the keySize - 
     * remembers the maximum or similar...
     */
    if(keySize > *maxSize) {
      *maxSize = keySize;
    }
    currentOffset += keySize;
  }

  return currentOffset;
}