# Date-stamped snapshot version rather than a release number; the exact
# upstream commit is pinned by `rev` in `src`.
# NOTE(review): the date is presumably that commit's date — confirm and
# update both together when bumping.
11 version = "unstable-2015-08-21";
13 src = fetchFromGitHub {
16 rev = "b56fa78a2fe44ac2851bae5bf4f4693a0644da7b";
17 hash = "sha256-YhXs45IbriKWKULguZM4DgfV/Fzr73VHxA1pFTXCyv8=";
22 name = "add-cmakelists.txt";
23 url = "https://github.com/CLD2Owners/cld2/pull/65/commits/9cfac02c2ac7802ab7079560b38a474473c45f51.patch";
24 hash = "sha256-uOjmUk8kMFl+wED44ErXoLRyblhgDwFx9K1Wj65Omh8=";
# cmake is the only build-time tool listed.
# NOTE(review): the CMake build files presumably come from the
# `add-cmakelists.txt` patch (upstream pull request 65) rather than from the
# pinned revision itself — confirm before dropping that patch.
28 nativeBuildInputs = [ cmake ];
31 homepage = "https://github.com/CLD2Owners/cld2";
32 description = "Compact Language Detector 2";
34 CLD2 probabilistically detects over 80 languages in Unicode UTF-8 text,
35 either plain text or HTML/XML. Legacy encodings must be converted to valid
36 UTF-8 by the caller. For mixed-language input, CLD2 returns the top three
37 languages found and their approximate percentages of the total text bytes
38 (e.g. 80% English and 20% French out of 1000 bytes of text means about 800
39 bytes of English and 200 bytes of French). Optionally, it also returns a
40 vector of text spans with the language of each identified. This may be
41 useful for applying different spelling-correction dictionaries or
42 different machine translation requests to each span. The design target is
43 web pages of at least 200 characters (about two sentences); CLD2 is not
44 designed to do well on very short text, lists of proper names, part
47 license = licenses.asl20; # NOTE(review): asl20 should be Apache License 2.0 in nixpkgs' license set — confirm against upstream LICENSE
48 maintainers = with maintainers; [ chvp ]; # `with` brings the maintainers set into scope so the bare name `chvp` resolves
49 platforms = platforms.all; # no platform restriction is declared here