2

I am receiving data from a Google Language Translator service and need help splitting it.

// Start hook (presumably Unity's MonoBehaviour.Start message — confirm against the enclosing class):
// requests a translation of a sample sentence from English ("en") to French ("fr").
void Start()
{
    string sampleText = "Hello, This is a test!";
    string sourceLanguage = "en";
    string targetLanguage = "fr";

    translateText(sampleText, sourceLanguage, targetLanguage);
}

/// <summary>
/// Builds the translation request URL for <paramref name="text"/> and starts the coroutine
/// that downloads and decodes the response.
/// </summary>
/// <param name="text">Text to translate; sent as the <c>q</c> query parameter.</param>
/// <param name="fromLanguage">Source language code, sent as the <c>sl</c> query parameter (e.g. "en").</param>
/// <param name="toLanguage">Target language code, sent as the <c>tl</c> query parameter (e.g. "fr").</param>
void translateText(string text, string fromLanguage, string toLanguage)
{
    // EscapeDataString, unlike EscapeUriString, also escapes reserved characters such as
    // '&', '#', '+' and '?'. Left unescaped, those would cut off or corrupt the query value.
    string url = "https://translate.googleapis.com/translate_a/single?client=gtx"
        + "&sl=" + Uri.EscapeDataString(fromLanguage ?? string.Empty)
        + "&tl=" + Uri.EscapeDataString(toLanguage ?? string.Empty)
        + "&dt=t&q=" + Uri.EscapeDataString(text);
    StartCoroutine(startTranslator(url));
}

/// <summary>
/// Coroutine that downloads <paramref name="url"/>, decodes the response with
/// <c>decodeResult</c> and logs the decoded values.
/// </summary>
/// <param name="url">Fully built request URL.</param>
IEnumerator startTranslator(string url)
{
    // The request is disposed when the coroutine leaves this block.
    using (UnityWebRequest www = UnityWebRequest.Get(url))
    {
        yield return www.Send();

        // Stop early on a failed request instead of trying to decode an error body.
        if (!string.IsNullOrEmpty(www.error))
        {
            Debug.LogError("Translation request failed: " + www.error);
            yield break;
        }

        string rawText = www.downloadHandler.text;
        Debug.Log("Raw string Received: " + rawText);

        LanguageResult tempResult = decodeResult(rawText);
        if (tempResult == null)
        {
            // decodeResult returns null when fewer than four values could be extracted.
            Debug.LogWarning("Translation response could not be decoded.");
            yield break;
        }

        Debug.Log("Original Text: " + tempResult.originalText);
        Debug.Log("Translated Text: " + tempResult.translatedText);
        Debug.Log("LanguageIso: " + tempResult.languageIso);
    }
}

// Splits the raw response on every '[', '"', ']' and ',' character, drops empty pieces, and maps
// the first four pieces onto a LanguageResult. Returns null when fewer than four pieces are found.
// Note: commas inside the quoted text are treated as delimiters too.
LanguageResult decodeResult(string result)
{
    string[] pieces = result.Split(
        new[] { '[', '\"', ']', ',' },
        StringSplitOptions.RemoveEmptyEntries);

    if (pieces.Length < 4)
    {
        return null;
    }

    return new LanguageResult
    {
        translatedText = pieces[0],
        originalText = pieces[1],
        unknowValue = pieces[2],
        languageIso = pieces[3]
    };
}

/// <summary>Holds the four values extracted from a translation response.</summary>
public class LanguageResult
{
    /// <summary>First extracted value: the translated text.</summary>
    public string translatedText;
    /// <summary>Second extracted value: the original (untranslated) text.</summary>
    public string originalText;
    /// <summary>Third extracted value; its meaning is not documented here ("0" in the sample response).</summary>
    public string unknowValue;
    /// <summary>Fourth extracted value: the language code reported by the response (e.g. "en").</summary>
    public string languageIso;
}

I then call it with translateText("Hello, This is a test!", "en", "fr"); from the Start() function, which translates the English sentence to French using ISO 639-1 language codes.

The received data looks like this:

[[["Bonjour, Ceci est un test!","Hello, This is a test!",,,0]],,"en"]

I want to split it like this:

  • Bonjour, Ceci est un test!
  • Hello, This is a test!
  • 0
  • en

and put them into a string array in order.

I currently use this:

// Splits on every '[', '"', ']' and ',' character and drops the empty pieces.
// Commas inside the quoted text are treated as delimiters as well.
char[] delims = { '[', '\"', ']', ',' };
        string[] arr = result.Split(delims, StringSplitOptions.RemoveEmptyEntries);

This works if there is no comma in the received string. If there is a comma, the split values are messed up. What's the best way of splitting this?

EDIT:

With Blorgbeard's solution, the final working code is as below. Hopefully, this will help somebody else. This shouldn't be used for commercial purposes, only for personal or school projects.

// Start hook (presumably Unity's MonoBehaviour.Start message — confirm against the enclosing class):
// requests a translation of a sample sentence from English ("en") to French ("fr").
void Start()
{
    // Alternative inputs that exercise quotes, slashes, backslashes and commas in the text:
    //translateText("Hello, This is \" / \\ a test !", "en", "fr");
    //translateText("Hello, This is , \\ \" a test !", "en", "fr");
    string sampleText = "Hello, This is a test!";
    string sourceLanguage = "en";
    string targetLanguage = "fr";

    translateText(sampleText, sourceLanguage, targetLanguage);
}

/// <summary>
/// Builds the translation request URL for <paramref name="text"/> and starts the coroutine
/// that downloads and decodes the response.
/// </summary>
/// <param name="text">Text to translate; sent as the <c>q</c> query parameter.</param>
/// <param name="fromLanguage">Source language code, sent as the <c>sl</c> query parameter (e.g. "en").</param>
/// <param name="toLanguage">Target language code, sent as the <c>tl</c> query parameter (e.g. "fr").</param>
void translateText(string text, string fromLanguage, string toLanguage)
{
    // EscapeDataString, unlike EscapeUriString, also escapes reserved characters such as
    // '&', '#', '+' and '?'. Left unescaped, those would cut off or corrupt the query value.
    string url = "https://translate.googleapis.com/translate_a/single?client=gtx"
        + "&sl=" + Uri.EscapeDataString(fromLanguage ?? string.Empty)
        + "&tl=" + Uri.EscapeDataString(toLanguage ?? string.Empty)
        + "&dt=t&q=" + Uri.EscapeDataString(text);
    StartCoroutine(startTranslator(url));
}

/// <summary>
/// Coroutine that downloads <paramref name="url"/>, decodes the response with
/// <c>decodeResult</c> and hands the result to <c>displayResult</c>.
/// </summary>
/// <param name="url">Fully built request URL.</param>
IEnumerator startTranslator(string url)
{
    // The request is disposed when the coroutine leaves this block.
    using (UnityWebRequest www = UnityWebRequest.Get(url))
    {
        yield return www.Send();

        // Stop early on a failed request instead of trying to decode an error body.
        if (!string.IsNullOrEmpty(www.error))
        {
            Debug.LogError("Translation request failed: " + www.error);
            yield break;
        }

        string rawText = www.downloadHandler.text;
        Debug.Log("Raw string Received: " + rawText);

        LanguageResult tempResult = decodeResult(rawText);
        if (tempResult == null)
        {
            // decodeResult returns null when fewer than four values could be extracted.
            Debug.LogWarning("Translation response could not be decoded.");
            yield break;
        }

        displayResult(tempResult);
    }
}

/// <summary>Logs the original text, translated text and language code of a decoded result.</summary>
/// <param name="translationResult">Decoded result; may be null when decoding failed.</param>
void displayResult(LanguageResult translationResult)
{
    // decodeResult returns null for responses with fewer than four values, so guard here
    // rather than throwing a NullReferenceException.
    if (translationResult == null)
    {
        Debug.LogWarning("No translation result to display.");
        return;
    }

    Debug.Log("Original Text: " + translationResult.originalText);
    Debug.Log("Translated Text: " + translationResult.translatedText);
    Debug.Log("LanguageIso: " + translationResult.languageIso);
}

// Runs the raw response through Decode and maps the first four values onto a LanguageResult.
// Returns null when fewer than four values were extracted.
LanguageResult decodeResult(string result)
{
    string[] values = Decode(result);
    if (values.Length < 4)
    {
        return null;
    }

    return new LanguageResult
    {
        translatedText = values[0],
        originalText = values[1],
        unknowValue = values[2],
        languageIso = values[3]
    };
}

/// <summary>Holds the four values extracted from a translation response.</summary>
public class LanguageResult
{
    /// <summary>First extracted value: the translated text.</summary>
    public string translatedText;
    /// <summary>Second extracted value: the original (untranslated) text.</summary>
    public string originalText;
    /// <summary>Third extracted value; its meaning is not documented here ("0" in the sample response).</summary>
    public string unknowValue;
    /// <summary>Fourth extracted value: the language code reported by the response (e.g. "en").</summary>
    public string languageIso;
}

/// <summary>
/// Extracts the values of a JSON-like array response in document order, ignoring bracket
/// nesting and empty elements (such as <c>,,</c>).
/// </summary>
/// <param name="input">Raw response text. Null or empty input yields an empty array.</param>
/// <returns>
/// Quoted strings with their escape sequences decoded (empty strings are kept), and unquoted
/// tokens such as numbers with surrounding whitespace trimmed.
/// </returns>
private string[] Decode(string input)
{
    List<string> finalResult = new List<string>();
    if (string.IsNullOrEmpty(input))
    {
        return finalResult.ToArray();
    }

    const string separators = ",[]";
    // StringBuilder avoids re-allocating the partial value on every appended character.
    System.Text.StringBuilder current = new System.Text.StringBuilder();
    bool inToken = false;   // inside an unquoted value (e.g. a number)
    bool inString = false;  // inside a double-quoted string

    for (int i = 0; i < input.Length; i++)
    {
        char chr = input[i];

        if (inString)
        {
            if (chr == '"')
            {
                // An unescaped quote closes the string.
                finalResult.Add(current.ToString());
                current.Length = 0;
                inString = false;
            }
            else if (chr == '\\' && i + 1 < input.Length)
            {
                // Decode the escape sequence instead of copying the letter after the backslash.
                char escapedChar = input[++i];
                int codeUnit;
                switch (escapedChar)
                {
                    case 'b': current.Append('\b'); break;
                    case 'f': current.Append('\f'); break;
                    case 'n': current.Append('\n'); break;
                    case 'r': current.Append('\r'); break;
                    case 't': current.Append('\t'); break;
                    case 'u':
                        // \uXXXX: four hex digits giving one UTF-16 code unit.
                        if (i + 4 < input.Length
                            && int.TryParse(
                                input.Substring(i + 1, 4),
                                System.Globalization.NumberStyles.AllowHexSpecifier,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out codeUnit))
                        {
                            current.Append((char)codeUnit);
                            i += 4;
                        }
                        else
                        {
                            current.Append(escapedChar);
                        }
                        break;
                    default:
                        // \" \\ \/ and any unknown escape: keep the character itself.
                        current.Append(escapedChar);
                        break;
                }
            }
            else
            {
                current.Append(chr);
            }
            continue;
        }

        if (chr == '"' || separators.IndexOf(chr) >= 0)
        {
            // A separator or an opening quote ends any unquoted token in progress.
            if (inToken)
            {
                string token = current.ToString().Trim();
                if (token.Length > 0)
                {
                    finalResult.Add(token);
                }
                inToken = false;
            }
            current.Length = 0;
            inString = (chr == '"');
            continue;
        }

        if (!inToken)
        {
            inToken = true;
            current.Length = 0;
        }
        current.Append(chr);
    }

    // Flush an unquoted token that runs to the end of the input.
    if (inToken)
    {
        string lastToken = current.ToString().Trim();
        if (lastToken.Length > 0)
        {
            finalResult.Add(lastToken);
        }
    }

    return finalResult.ToArray();
}
11
  • Can the strings also have (escaped) quotes in them? Commented Aug 24, 2016 at 21:30
  • Regex.Split might be the way to go here, then you could specifically disregard , followed by a space, etc. Commented Aug 24, 2016 at 21:30
  • @Blorgbeard I just checked, yes it can have \" in it. This is so tricky to me. Commented Aug 24, 2016 at 21:33
  • @l'L'l It would be good if you provide an example. Commented Aug 24, 2016 at 21:34
  • It's almost JSON, you may be able to find a JSON parser that deals with empty array elements. Commented Aug 24, 2016 at 21:35

4 Answers 4

4

You could code up a simple parser yourself. Here's one I threw together (could use some cleaning up, but demonstrates the idea):

/// <summary>
/// Lazily extracts the values of a JSON-like array response in document order, ignoring
/// bracket nesting and empty elements (such as <c>,,</c>).
/// </summary>
/// <param name="input">Raw response text. Null or empty input yields no values.</param>
/// <returns>
/// Quoted strings with their escape sequences decoded (empty strings are kept), and unquoted
/// tokens such as numbers with surrounding whitespace trimmed.
/// </returns>
private static IEnumerable<string> Parse(string input) {
    if (string.IsNullOrEmpty(input)) {
        yield break;
    }

    const string separators = ",[]";
    // StringBuilder avoids re-allocating the partial value on every appended character.
    var current = new System.Text.StringBuilder();
    bool inToken = false;   // inside an unquoted value (e.g. a number)
    bool inString = false;  // inside a double-quoted string

    for (int i = 0; i < input.Length; i++) {
        char chr = input[i];

        if (inString) {
            if (chr == '"') {
                // An unescaped quote closes the string.
                yield return current.ToString();
                current.Length = 0;
                inString = false;
            } else if (chr == '\\' && i + 1 < input.Length) {
                // Decode the escape sequence instead of copying the letter after the backslash.
                char escapedChar = input[++i];
                int codeUnit;
                switch (escapedChar) {
                    case 'b': current.Append('\b'); break;
                    case 'f': current.Append('\f'); break;
                    case 'n': current.Append('\n'); break;
                    case 'r': current.Append('\r'); break;
                    case 't': current.Append('\t'); break;
                    case 'u':
                        // \uXXXX: four hex digits giving one UTF-16 code unit.
                        if (i + 4 < input.Length
                            && int.TryParse(
                                input.Substring(i + 1, 4),
                                System.Globalization.NumberStyles.AllowHexSpecifier,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out codeUnit)) {
                            current.Append((char)codeUnit);
                            i += 4;
                        } else {
                            current.Append(escapedChar);
                        }
                        break;
                    default:
                        // \" \\ \/ and any unknown escape: keep the character itself.
                        current.Append(escapedChar);
                        break;
                }
            } else {
                current.Append(chr);
            }
            continue;
        }

        if (chr == '"' || separators.IndexOf(chr) >= 0) {
            // A separator or an opening quote ends any unquoted token in progress.
            if (inToken) {
                string token = current.ToString().Trim();
                if (token.Length > 0) {
                    yield return token;
                }
                inToken = false;
            }
            current.Length = 0;
            inString = (chr == '"');
            continue;
        }

        if (!inToken) {
            inToken = true;
            current.Length = 0;
        }
        current.Append(chr);
    }

    // Flush an unquoted token that runs to the end of the input.
    if (inToken) {
        string lastToken = current.ToString().Trim();
        if (lastToken.Length > 0) {
            yield return lastToken;
        }
    }
}

Here's a jsfiddle demo.

Sign up to request clarification or add additional context in comments.

6 Comments

Nice out of the box thinking here +1
Is it possible to make this into a function that returns string array of 4 that contains those result? So instead of private static IEnumerable<string> Parse(string input), it becomes private string [] Parse(string input)?
Sure, just wrap it and call .ToArray() - or you could add to a List<string> instead of yield returning values, and then return the list .ToArray()ed at the end.
I did something like this but that didn't work. Can you spot the problem?
Yes, finalResult.Add(current); should be in the places where I had yield return current, not just at the bottom.
|
1

Using Regex.Split you could do something like this for example:

using System;
using System.Text.RegularExpressions;

public class Example
{
    // Demo: splits the sample response with a regex and prints every non-empty piece
    // whose first character is a letter or a digit.
    public static void Main()
    {
        var input = "[[[\"Bonjour, Ceci est un test!\",\"Hello, This is a test!\",,,0]],,\"en\"]";
        var pieces = Regex.Split(input, "\\[|\\]|[^a-zA-Z ],|\",\"|\"|\"");

        foreach (var piece in pieces)
        {
            // Skip the empty strings produced between adjacent delimiters.
            if (String.IsNullOrEmpty(piece))
            {
                continue;
            }

            char first = piece[0];
            if (!Char.IsLetter(first) && !Char.IsDigit(first))
            {
                continue;
            }

            Console.WriteLine(piece);
        }
    }
}

Output:

Bonjour, Ceci est un test!
Hello, This is a test!
0
en

If you want everything that was split you can simply remove the bool check for alphacharacters.

10 Comments

Wow this is so close. The only problem is that it is skipping the 0. Please take a look at the question again. You will see which 0.
Still the same thing I got with your original answer.
@Programmer: See edit ( online example : dotnetfiddle.net/00jsNa )
I don't know why, but I am getting the same result without the 0.
You're welcome! Regex sometimes takes a lot of tinkering with more complex scenarios like you have, but at any rate glad you found a solution.
|
1

Here is a crazy idea - split by " and then by the rest (but won't work if there is " between the "'s)

// Sample response. Splitting on '"' leaves the quoted text at the odd indices; the even-indexed
// segments hold only structure and unquoted values, so those are split again on '[', ']' and ','.
var s = @"[[[""Bonjour, Ceci est un test!"",""Hello, This is a test!"",,,0]],,""en""]";

var a = s.Split('"')
         .SelectMany((segment, index) => index % 2 == 1
             ? new[] { segment }
             : segment.Split("[],".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
         .ToArray();

Debug.Print(string.Join("|", a)); // "Bonjour, Ceci est un test!|Hello, This is a test!|0|en"

1 Comment

Works but fails when there is an escaped quote in it. Thanks for your answer.
0

You can try splitting with a regex. I tested it with the sample you provided, and it produces the following result.

    // Splits the sample on every '[', ']' and ',' (commas inside the quoted text included)
    // and prints each resulting piece, empty ones too.
    var str="[[[\"Bonjour, Ceci est un test!\",\"Hello, This is a test!\",,,0]],,\"en\"]";
    foreach(var piece in Regex.Split(str,@"\[|\]|\,")){
       Console.WriteLine(piece);
    }

   "Bonjour Ceci est un test!"
   "Hello This is a test!"
    0
   "en"

1 Comment

This did not work and the length is 15 instead of 4. I got "Bonjour in array[3].

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.