Convert String to UTF-8 bytes

Convert String to UTF-8 bytes - string

How do I encode a String in Ceylon as UTF-8 bytes?
value string = "my_string";
[Byte*] bytes = string.______;

Use ceylon.buffer.charset.
import ceylon.buffer.charset {
utf8
}
// Example entry point: encodes a string to UTF-8 bytes using the `utf8`
// charset imported from ceylon.buffer.charset above.
shared void run() {
value string = "my_string";
// utf8.encode(...) yields the encoded bytes as a List<Byte>.
List<Byte> bytes = utf8.encode(string);
Byte[] bytesSequence = bytes.sequence(); // in case a List isn’t enough
}
Try it!

Related

Trouble with HEX and ASCII encoding in Node.js [duplicate]

I'm trying to convert a unicode string to a hexadecimal representation in javascript.
This is what I have:
/**
 * Turns a hex string into text by reading it two hex digits at a time and
 * mapping each pair to the character with that code.
 *
 * @param {*} hex - Value whose `toString()` result is read as hex digits.
 * @returns {string} One character per two-digit group of the input.
 */
function convertFromHex(hex) {
  const digits = hex.toString(); // force conversion
  const pieces = [];
  for (let pos = 0; pos < digits.length; pos += 2) {
    const code = parseInt(digits.slice(pos, pos + 2), 16);
    pieces.push(String.fromCharCode(code));
  }
  return pieces.join('');
}
/**
 * Concatenates the hexadecimal form of every UTF-16 code unit in `str`.
 * Each unit is written with as many digits as it needs (no zero padding).
 *
 * @param {string} str - Text to convert.
 * @returns {string} The joined hex digits.
 */
function convertToHex(str) {
  const unitToHex = (_, idx) => str.charCodeAt(idx).toString(16);
  return Array.from({ length: str.length }, unitToHex).join('');
}
But it fails on Unicode characters, such as Chinese:
Input:
漢字
Output:
ªo"[W
Any ideas? Can this be done in javascript?

Remember that a JavaScript code unit is 16 bits wide. Therefore the hex string form will be 4 digits per code unit.
usage:
var str = "\u6f22\u5b57"; // "\u6f22\u5b57" === "漢字"
alert(str.hexEncode().hexDecode());
String to hex form:
/**
 * Encodes the string as hex, four zero-padded digits per UTF-16 code unit.
 *
 * @returns {string} Hex text whose length is four times the string length.
 */
String.prototype.hexEncode = function () {
  const toFourDigits = (_, pos) =>
    this.charCodeAt(pos).toString(16).padStart(4, "0");
  return Array.from({ length: this.length }, toFourDigits).join("");
};
Back again:
/**
 * Decodes hex text produced four digits per UTF-16 code unit back into a
 * string. A trailing group shorter than four digits is parsed as-is.
 *
 * @returns {string} The decoded text ("" when nothing matches).
 */
String.prototype.hexDecode = function () {
  const groups = this.match(/.{1,4}/g) || [];
  return groups
    .map((group) => String.fromCharCode(parseInt(group, 16)))
    .join("");
};

Here is a tweak of McDowell's algorithm that doesn't pad the result:
/**
 * Joins the hex value of each UTF-16 code unit of `str` without padding,
 * so every unit contributes only as many digits as its value needs.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The unpadded hex digits.
 */
function toHex(str) {
  const parts = [];
  let index = 0;
  while (index < str.length) {
    parts.push(str.charCodeAt(index).toString(16));
    index += 1;
  }
  return parts.join('');
}

A more up to date solution, for encoding:
// This is the same for all of the below, and
// you probably won't need it except for debugging
// in most cases.
/**
 * Formats a sequence of byte values as lowercase hex, two digits per byte.
 *
 * @param {Iterable<number>|ArrayLike<number>} bytes - Byte values to format.
 * @returns {string} The concatenated two-digit hex pairs.
 */
function bytesToHex(bytes) {
  const toPair = (byte) => byte.toString(16).padStart(2, "0");
  return Array.from(bytes).map(toPair).join("");
}
// You almost certainly want UTF-8, which is
// now natively supported:
/**
 * Encodes a string as UTF-8 using the built-in TextEncoder.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes.
 */
function stringToUTF8Bytes(string) {
  const encoder = new TextEncoder();
  return encoder.encode(string);
}
// But you might want UTF-16 for some reason.
// .charCodeAt(index) will return the underlying
// UTF-16 code-units (not code-points!), so you
// just need to format them in whichever endian order you want.
/**
 * Encodes a string as UTF-16 bytes, two bytes per code unit.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - Truthy for little-endian output;
 *   otherwise big-endian (the DataView default).
 * @returns {Uint8Array} `string.length * 2` bytes.
 */
function stringToUTF16Bytes(string, littleEndian) {
  const bytes = new Uint8Array(string.length * 2);
  // Using DataView is the only way to get a specific
  // endianness.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i !== string.length; i++) {
    // DataView offsets are in bytes, so code unit i lives at byte i * 2.
    view.setUint16(i * 2, string.charCodeAt(i), littleEndian);
  }
  return bytes;
}
// And you might want UTF-32 in even weirder cases.
// Fortunately, iterating a string gives the code
// points, which are identical to the UTF-32 encoding,
// though you still have the endianess issue.
/**
 * Encodes a string as UTF-32 bytes, four bytes per code point.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - Truthy for little-endian output;
 *   otherwise big-endian (the DataView default).
 * @returns {Uint8Array} Four bytes for every code point of `string`.
 */
function stringToUTF32Bytes(string, littleEndian) {
  // Iterating a string yields whole code points, not UTF-16 code units.
  const codepoints = Array.from(string, c => c.codePointAt(0));
  const bytes = new Uint8Array(codepoints.length * 4);
  // Using DataView is the only way to get a specific
  // endianness.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i !== codepoints.length; i++) {
    // DataView offsets are in bytes, so code point i lives at byte i * 4.
    view.setUint32(i * 4, codepoints[i], littleEndian);
  }
  return bytes;
}
Examples:
bytesToHex(stringToUTF8Bytes("hello 漢字 👍"))
// "68656c6c6f20e6bca2e5ad9720f09f918d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", false))
// "00680065006c006c006f00206f225b570020d83ddc4d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", true))
// "680065006c006c006f002000226f575b20003dd84ddc"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", false))
// "00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", true))
// "68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"
For decoding, it's generally a lot simpler, you just need:
/**
 * Parses a hex string into bytes, two hex digits per byte.
 *
 * @param {string} hex - Hex digits to parse.
 * @returns {Uint8Array} One byte for each complete two-digit pair.
 */
function hexToBytes(hex) {
  const decoded = new Uint8Array(hex.length / 2);
  decoded.forEach((_, index) => {
    const start = index * 2;
    decoded[index] = parseInt(hex.slice(start, start + 2), 16);
  });
  return decoded;
}
then use the encoding parameter of TextDecoder:
// UTF-8 is default
new TextDecoder().decode(hexToBytes("68656c6c6f20e6bca2e5ad9720f09f918d"));
// but you can also use:
new TextDecoder("UTF-16LE").decode(hexToBytes("680065006c006c006f002000226f575b20003dd84ddc"))
new TextDecoder("UTF-16BE").decode(hexToBytes("00680065006c006c006f00206f225b570020d83ddc4d"));
// "hello 漢字 👍"
Here's the list of allowed encoding names: https://www.w3.org/TR/encoding/#names-and-labels
You might notice UTF-32 is not on that list, which is a pain, so:
/**
 * Decodes UTF-32 bytes into a string.
 *
 * @param {Uint8Array} bytes - UTF-32 encoded data; trailing bytes that do
 *   not form a full 4-byte unit are ignored.
 * @param {boolean} [littleEndian] - Truthy when the data is little-endian;
 *   otherwise big-endian (the DataView default).
 * @returns {string} The decoded text.
 * @throws {RangeError} If a 4-byte unit is not a valid code point.
 */
function bytesToStringUTF32(bytes, littleEndian) {
  // Restrict the view to the bytes this array actually covers; a subarray
  // shares its buffer with other data before and after it.
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const unitCount = Math.floor(view.byteLength / 4);
  // Convert in bounded batches: spreading a very large array into a single
  // call can exceed the engine's argument limit.
  const batchSize = 4096;
  let text = "";
  for (let start = 0; start < unitCount; start += batchSize) {
    const end = Math.min(start + batchSize, unitCount);
    const codepoints = [];
    for (let i = start; i < end; i++) {
      codepoints.push(view.getUint32(i * 4, littleEndian));
    }
    text += String.fromCodePoint(...codepoints);
  }
  return text;
}
Then:
bytesToStringUTF32(hexToBytes("00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"), false)
bytesToStringUTF32(hexToBytes("68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"), true)
// "hello 漢字 👍"

It depends on what encoding you use. If you want to convert utf-8 encoded hex to string, use this:
/**
 * Decodes UTF-8 encoded hex text into a string by percent-encoding each
 * pair of characters and passing the result to decodeURIComponent.
 * If decoding fails, the problem is logged and the input is returned.
 *
 * @param {string} hex - Hex digits of UTF-8 encoded text.
 * @param {*} [str] - Ignored on input; kept for signature compatibility.
 * @returns {string} The decoded text, or `hex` itself when decoding fails.
 */
function fromHex(hex, str) {
  try {
    const percentEncoded = hex.replace(/(..)/g, '%$1');
    return decodeURIComponent(percentEncoded);
  } catch (e) {
    console.log('invalid hex input: ' + hex);
    return hex;
  }
}
For the other direction use this:
/**
 * Encodes a string as the lowercase hex of its UTF-8 bytes, always two
 * digits per byte so the output can be decoded pair by pair.
 * If the text cannot be encoded (for example it holds a lone surrogate),
 * the problem is logged and the input is returned.
 *
 * @param {string} str - Text to encode.
 * @param {*} [hex] - Ignored on input; kept for signature compatibility.
 * @returns {string} Hex of the UTF-8 bytes, or `str` itself on failure.
 */
function toHex(str, hex) {
  try {
    // encodeURIComponent emits each UTF-8 byte either as "%XX" or, for
    // unreserved ASCII, as the literal character; map both to two hex digits.
    hex = encodeURIComponent(str).replace(
      /%([0-9A-F]{2})|[^%]/g,
      function (match, escapedByte) {
        return escapedByte !== undefined
          ? escapedByte.toLowerCase()
          : match.charCodeAt(0).toString(16).padStart(2, '0');
      }
    );
  } catch (e) {
    hex = str;
    console.log('invalid text input: ' + str);
  }
  return hex;
}

how do you get "\u6f22\u5b57" from 漢字 in JavaScript?
These are JavaScript Unicode escape sequences e.g. \u12AB. To convert them, you could iterate over every code unit in the string, call .toString(16) on it, and go from there.
However, it is more efficient to also use hexadecimal escape sequences e.g. \xAA in the output wherever possible.
Also note that ASCII symbols such as A, b, and - probably don’t need to be escaped.
I’ve written a small JavaScript library that does all this for you, called jsesc. It has lots of options to control the output.
Here’s an online demo of the tool in action: http://mothereff.in/js-escapes#1%E6%BC%A2%E5%AD%97
Your question was tagged as utf-8. Reading the rest of your question, UTF-8 encoding/decoding didn’t seem to be what you wanted here, but in case you ever need it: use utf8.js (online demo).

Here you go. :D
"漢字".split("").reduce((hex,c)=>hex+=c.charCodeAt(0).toString(16).padStart(4,"0"),"")
"6f225b57"
for non unicode
"hi".split("").reduce((hex,c)=>hex+=c.charCodeAt(0).toString(16).padStart(2,"0"),"")
"6869"
ASCII (utf-8) binary HEX string to string
"68656c6c6f20776f726c6421".match(/.{1,2}/g).reduce((acc,char)=>acc+String.fromCharCode(parseInt(char, 16)),"")
String to ASCII (utf-8) binary HEX string
"hello world!".split("").reduce((hex,c)=>hex+=c.charCodeAt(0).toString(16).padStart(2,"0"),"")
--- unicode ---
String to UNICODE (utf-16) binary HEX string
"hello world!".split("").reduce((hex,c)=>hex+=c.charCodeAt(0).toString(16).padStart(4,"0"),"")
UNICODE (utf-16) binary HEX string to string
"00680065006c006c006f00200077006f0072006c00640021".match(/.{1,4}/g).reduce((acc,char)=>acc+String.fromCharCode(parseInt(char, 16)),"")

Here is my take: these functions convert a UTF8 string to a proper HEX without the extra zeroes padding. A real UTF8 string has characters with 1, 2, 3 and 4 bytes length.
While working on this I found a couple key things that solved my problems:
str.split('') doesn't handle multi-byte characters like emojis correctly. The proper/modern way to handle this is with Array.from(str)
encodeURIComponent() and decodeURIComponent() are great tools to convert between string and hex. They are pretty standard, they handle UTF8 correctly.
(Most) ASCII characters (codes 0 - 127) don't get URI encoded, so they need to be handled separately. But c.charCodeAt(0).toString(16) works for those, as long as the result is padded to two digits
/**
 * Converts a string to the lowercase hex of its UTF-8 bytes.
 *
 * @param {string} str - Text to convert.
 * @returns {string} Two hex digits per UTF-8 byte.
 * @throws {URIError} If `str` contains a lone surrogate.
 */
function utf8ToHex(str) {
  // Array.from splits by code point, so surrogate pairs stay together.
  return Array.from(str).map(c =>
    c.charCodeAt(0) < 128
      // ASCII is not always percent-encoded, so format it directly; pad so
      // control characters below 0x10 still occupy two digits.
      ? c.charCodeAt(0).toString(16).padStart(2, '0')
      : encodeURIComponent(c).replace(/\%/g,'').toLowerCase()
  ).join('');
}
/**
 * Converts the hex of UTF-8 bytes back into a string.
 *
 * @param {string} hex - Two hex digits per UTF-8 byte.
 * @returns {string} The decoded text ("" for empty input).
 * @throws {URIError} If `hex` is not valid UTF-8 hex.
 */
function hexToUtf8(hex) {
  // Nothing to decode; without this guard the pair split below yields null.
  if (hex === '') {
    return '';
  }
  const pairs = hex.match(/[\s\S]{1,2}/g);
  return decodeURIComponent('%' + pairs.join('%'));
}
Demo: https://jsfiddle.net/lyquix/k2tjbrvq/

UTF-8 Supported Conversion
Encode (string to hex)
/**
 * Encodes text as lowercase hex, two digits for each UTF-8 byte.
 *
 * @param {string} str - Text to encode.
 * @returns {string} The hex representation of the UTF-8 bytes.
 * @throws {URIError} If `str` contains a lone surrogate.
 */
function utf8ToHex(str) {
  return Array.from(str).map(c =>
    // Pad ASCII to two digits: codes below 0x10 (tab, newline, ...) would
    // otherwise produce a single digit and corrupt the byte alignment.
    c.charCodeAt(0) < 128 ? c.charCodeAt(0).toString(16).padStart(2, '0') :
    encodeURIComponent(c).replace(/\%/g,'').toLowerCase()
  ).join('');
}
Decode (hex to string)
/**
 * Decodes hex (two digits per UTF-8 byte) into text.
 *
 * @param {string} hex - Hex digits to decode.
 * @returns {string} The decoded text ("" for empty input).
 * @throws {URIError} If `hex` is not valid UTF-8 hex.
 */
function hexToUtf8(hex) {
  // An empty string has no pairs: match() would return null and throw a
  // TypeError, so answer directly.
  if (hex === '') {
    return '';
  }
  return decodeURIComponent('%' + hex.match(/[\s\S]{1,2}/g).join('%'));
}

Convert Properties object to byte array in java

I want to convert a Properties object to a byte[]. I can do it with the following piece of code, but
/**
 * Renders the given properties with {@code Properties.list} into an
 * in-memory writer and returns the resulting text as bytes in the
 * platform default charset.
 *
 * @param properties the properties to render
 * @return the bytes of the rendered listing
 */
private byte[] getBytes(Properties properties){
    StringWriter buffer = new StringWriter();
    PrintWriter out = new PrintWriter(buffer);
    properties.list(out);
    byte[] encoded = buffer.getBuffer().toString().getBytes();
    try {
        buffer.close();
        out.close();
    } catch (IOException e) {
        log.error("unable to close resource stringWriter" + e.getStackTrace());
    }
    return encoded;
}
but properties.list(printWriter) also writes the header line "-- listing properties --" into the output. I need help finding the best way to do this.

I used a ByteArrayOutputStream to convert a Properties object. Your function could be modified to be the following -
/**
 * Serializes the given properties to a byte array using
 * {@code Properties.store}.
 *
 * @param properties the properties to serialize
 * @return the stored bytes; whatever was written before a failure if
 *         storing throws
 */
private byte[] getBytes(Properties properties){
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    try {
        // Store the method parameter (the snippet previously referenced an
        // undefined "props" variable).
        properties.store(byteArrayOutputStream, "");
    } catch (IOException e) {
        // Pass the exception itself so the logger prints the stack trace;
        // concatenating getStackTrace() only prints the array's identity.
        log.error("An error occurred while storing properties to a byte array", e);
    }
    return byteArrayOutputStream.toByteArray();
}

A properties file contains plain text, so to store a byte array you need to encode the bytes as plain text. The best way is to use Base64 encoding.
String Base64.encodeToString(byte[])
And to retrieve bytes:
byte[] Base64.decode(String)

MD5 in Java Micro Edition

I have a problem.
I must get a md5 hash of string in Java ME.
I have that code
/**
 * Computes the MD5 hash of a string's UTF-8 bytes.
 *
 * @param input the text to hash
 * @return the digest as 32 lowercase hex digits, or "" if no MD5
 *         implementation is available
 * @throws UnsupportedEncodingException if UTF-8 is not supported
 */
public static String md5(String input) throws UnsupportedEncodingException{
    try {
        MessageDigest algorithm = MessageDigest.getInstance("MD5");
        algorithm.reset();
        algorithm.update(input.getBytes("UTF-8"));
        byte[] digest = algorithm.digest();
        // byte[].toString() yields the array's identity, not its contents,
        // so format every byte as two hex digits instead.
        StringBuffer hex = new StringBuffer(digest.length * 2);
        for (int i = 0; i < digest.length; i++) {
            int value = digest[i] & 0xff; // treat the byte as unsigned
            if (value < 0x10) {
                hex.append('0');
            }
            hex.append(Integer.toHexString(value));
        }
        return hex.toString();
    }
    catch (NoSuchAlgorithmException ex) {
        // Kept from the original contract: callers receive an empty string
        // when MD5 is unavailable.
        return "";
    }
}
But MessageDigest.update() and MessageDigest.digest() accept only 3 arguments.
Any ideas?

The two other arguments are offset and len, which you can set to 0 and the length of the byte buffer respectively.

limiting the number of character in GUID

Is it possible, or is there any overload, to get a GUID of fewer than 32 characters?
Currently I am using this statement, but it gives me an error:
string guid = new Guid("{dddd-dddd-dddd-dddd}").ToString();
I want a key of 20 characters.

You can use a ShortGuid. Here is an example of an implementation.
It's nice to use ShortGuids in URLs or other places visible to an end user.
The following code:
Guid guid = Guid.NewGuid();
ShortGuid sguid1 = guid; // implicitly cast the guid as a shortguid
Console.WriteLine( sguid1 );
Console.WriteLine( sguid1.Guid );
Will give you this output:
FEx1sZbSD0ugmgMAF_RGHw
b1754c14-d296-4b0f-a09a-030017f4461f
This is the code for an Encode and Decode method:
/// <summary>
/// Encodes a Guid as a 22-character, URL-friendly Base64 string.
/// </summary>
/// <param name="guid">The Guid to encode.</param>
/// <returns>The Base64 text with "/" and "+" replaced by "_" and "-", cut to 22 characters.</returns>
public static string Encode(Guid guid)
{
    byte[] raw = guid.ToByteArray();
    string base64 = Convert.ToBase64String(raw);
    string urlSafe = base64.Replace("/", "_").Replace("+", "-");
    // Keep only the first 22 characters, dropping the trailing "==" padding.
    return urlSafe.Substring(0, 22);
}
/// <summary>
/// Decodes a 22-character URL-friendly Base64 string back into a Guid.
/// </summary>
/// <param name="value">Text using "_" and "-" in place of "/" and "+".</param>
/// <returns>The Guid built from the decoded bytes.</returns>
public static Guid Decode(string value)
{
    // Restore the standard Base64 alphabet and re-append the "==" padding.
    string base64 = value.Replace("_", "/").Replace("-", "+") + "==";
    return new Guid(Convert.FromBase64String(base64));
}

Use Jeff Atwood's ASCII85...
http://www.codinghorror.com/blog/2005/10/equipping-our-ascii-armor.html
and
http://www.codinghorror.com/blog/2005/10/c-implementation-of-ascii85.html

J2ME AES Decryption Error(org.bouncycastle.crypto.InvalidCipherTextException: pad block corrupted)

I am doing encryption and decryption using the AES algorithm with Bouncy Castle.
My encryption and decryption work for short inputs, but I get an error when the plain text is larger,
and sometimes the result is not decrypted data at all.
// Creates the key parameter, AES engine and padded buffered block cipher
// and stores them in `key`, `engine` and `cipher`, which are declared
// outside this method. Always returns true.
public static boolean setEncryptionKey(String keyText)
{
// NOTE(review): getBytes() without a charset uses the platform default
// encoding, so the key bytes may differ between platforms — confirm.
byte[] keyBytes = keyText.getBytes();
key = new KeyParameter(keyBytes);
engine = new AESFastEngine();
cipher = new PaddedBufferedBlockCipher(engine);
return true;
}
Encryption:
// Encrypts plainText with the shared `cipher` and `key` (set up by
// setEncryptionKey) and returns the cipher bytes converted to a String.
public static String encryptString(String plainText)
{
byte[] plainArray = plainText.getBytes();
// true selects encryption mode.
cipher.init(true, key);
// Size the output buffer as reported by the cipher for this input length.
byte[] cipherBytes = new byte[cipher.getOutputSize(plainArray.length)];
int cipherLength = cipher.processBytes(plainArray, 0, plainArray.length, cipherBytes, 0);
// Writes the final block after the bytes already produced; the
// number of bytes doFinal writes is not recorded.
cipher.doFinal(cipherBytes, cipherLength);
// NOTE(review): cipher bytes are arbitrary binary data; decoding them with
// the default charset is lossy, so getBytes() on the result may not return
// the same bytes. Consider returning byte[] or a Base64/hex encoding.
String cipherString = new String(cipherBytes);
return cipherString;
}
Decryption:
// Decrypts encryptedText with the shared `cipher` and `key` and returns the
// result as a String, cut at the first NUL character if one is present.
public static String decryptString(String encryptedText)
{
// NOTE(review): this recovers bytes from a String built from raw cipher
// bytes; if that conversion was lossy the input here is already corrupted.
byte[] cipherBytes = encryptedText.getBytes();
// false selects decryption mode.
cipher.init(false, key);
byte[] decryptedBytes = new byte[cipher.getOutputSize(cipherBytes.length)];
int decryptedLength = cipher.processBytes(cipherBytes, 0, cipherBytes.length, decryptedBytes, 0);
// The number of bytes written by doFinal is not recorded, so the whole
// buffer (including any unused tail) is converted below.
cipher.doFinal(decryptedBytes, decryptedLength);
String decryptedString = new String(decryptedBytes);
// Drop everything from the first NUL character onwards.
int index = decryptedString.indexOf("\u0000");
if (index >= 0)
{
decryptedString = decryptedString.substring(0, index);
}
return decryptedString;
}
This decryption is giving me following error
org.bouncycastle.crypto.InvalidCipherTextException: pad block corrupted
at org.bouncycastle.crypto.paddings.PKCS7Padding.padCount(+30)
at org.bouncycastle.crypto.paddings.PaddedBufferedBlockCipher.doFinal(+190)
at com.NewCrypto.decryptString(NewCrypto.java:103)
at com.New_Midlet.startApp(New_Midlet.java:23)
at javax.microedition.midlet.MIDletProxy.startApp(MIDletProxy.java:44)
at com.sun.midp.midlet.Scheduler.schedule(Scheduler.java:375)
at com.sun.midp.main.Main.runLocalClass(Main.java:477)
at com.sun.midp.main.Main.main(+80)
what could be the problem ?

The line
String cipherString = new String(cipherBytes);
is a bug. cipherBytes is a byte array with arbitrary values and cannot be converted to a string using any of the Java string decoders. You should just send/save the cipher as a byte array. If you must make it a string then you'll have to use an encoder. Base64 encoders are often used, as are Base16 (hex). You can use the Apache Commons Codec or my favorite, the Harder Base64 codec.

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Convert String to UTF-8 bytes - string

How do I encode a String in Ceylon as UTF-8 bytes? value string = "my_string"; [Byte*] bytes = string.______;

Use ceylon.buffer.charset. import ceylon.buffer.charset { utf8 } shared void run() { value string = "my_string"; List<Byte> bytes = utf8.encode(string); Byte[] bytesSequence = bytes.sequence(); // in case a List isn’t enough } Try it!

Related

Trouble with HEX and ASCII encoding in Node.js [duplicate]

Convert Properties object to byte array in java

MD5 in Java Micro Edition

limiting the number of character in GUID

J2ME AES Decryption Error(org.bouncycastle.crypto.InvalidCipherTextException: pad block corrupted)

Categories

Resources