Tokenize String – Comma Delimiter, But Keep Quoted Words together ( Using PowerShell )

Background

I wanted some proof-of-concept code to see how difficult it really is to tokenize a sentence.

We would like to keep words enclosed in double quotes together.

 

Code

PowerShell

 


# Enable the strictest checks available (e.g. referencing an uninitialized variable is an error).
Set-StrictMode -Version Latest

# Comma literal; tokenStripControlChar compares the last character of each token against it.
[string] $CHAR_COMMA = ',';

<#
	Split a sentence into tokens.

	The sentence is cut at every single space character for which the rest
	of the string contains balanced double quotes, so text enclosed in
	double quotes stays together (and keeps its quotes).

	Parameter:
		sentence - the text to split

	Returns the tokens. PowerShell unrolls the array on output, so a
	one-token result reaches the caller as a plain string unless the caller
	collects it into an array.
#>
function tokenizeString([string] $sentence)
{

	# a space followed only by unquoted characters and complete "..." groups
	# up to the end of the string is a split point
	$splitPattern = ' (?=(?:[^"]|"[^"]*")*$)'

	# cut the sentence at every split point
	[string[]] $tokens = [regex]::Split($sentence, $splitPattern);

	return $tokens;

}

<#
	Strip control characters from each token, in place.

	For every element of the array:
	  - all double-quote characters are removed
	  - one trailing delimiter ($CHAR_COMMA) is removed, on the assumption
	    that it was used as a delimiter rather than as data

	Parameter:
		listToken - array of tokens; its elements are overwritten

	Returns nothing. The caller must pass a variable that already holds a
	[string[]]; any other array type is converted to a new array during
	parameter binding and the caller's copy is left unchanged.

	Tokens that are empty (or become empty once quotes are removed) are
	left empty, and null elements are skipped.
#>
function tokenStripControlChar([string[]]$listToken)
{

	# nothing to do when no array was supplied
	# ($null goes on the left: with an array on the left, -eq filters the
	#  elements instead of testing the array itself)
	if ($null -eq $listToken)
	{
		return;
	}

	for ($i = 0; $i -lt $listToken.Count; $i++)
	{

		$token = $listToken[$i];

		# a null element has no characters to strip
		if ($null -eq $token)
		{
			continue;
		}

		#strip Control Char - "
		$token = $token.Replace('"', '');

		#remove delimiter if it is the last character
		#EndsWith is safe on an empty token, where Substring(-1, 1) would throw
		if ($token.EndsWith($CHAR_COMMA, [System.StringComparison]::Ordinal))
		{
			$token = $token.Substring(0, $token.Length - $CHAR_COMMA.Length);
		}

		$listToken[$i] = $token;
	}

}

<#
	Build one display line per token.

	Each line is formatted as: tab, space, 1-based position, space, token,
	space, CR LF.

	Parameter:
		listToken - array of tokens to list

	Returns the formatted lines. PowerShell unrolls the list on output, so
	the caller receives an array for several tokens, a single string for one
	token, and nothing for an empty or null array.
#>
function tokenPrepareDisplay([string[]]$listToken)
{

	[string] $formatBuffer = "`t {0} {1} `r`n";

	# nothing to list when no array was supplied
	# ($null goes on the left: with an array on the left, -eq filters the
	#  elements, which wrongly reads as "true" when the array holds two or
	#  more null elements)
	if ($null -eq $listToken)
	{
		return;
	}

	$objArrayList = New-Object -TypeName "System.Collections.ArrayList";

	#Iterate list of tokens
	for ($i = 0; $i -lt $listToken.Count; $i++)
	{

		#format display buffer with the 1-based position and the token
		$log = $formatBuffer -f ($i + 1), $listToken[$i];

		#Add() returns the new index; discard it so it does not leak into
		#the function's output
		[void] $objArrayList.Add($log);
	}

	return ($objArrayList);

}

# Demo driver: tokenize a handful of sample sentences and print the result.

# Header line for each sample: "<1-based position>) <sentence>"
[string]   $headerFormat = "{0}) {1}";

# Must stay typed as [string[]]: tokenStripControlChar edits the array in
# place, which only reaches this variable when it already holds a string[].
[string[]] $tokens = $null;

$samples = @(
	'"Paris, France"',
	'"Milan, Italy", AC Milan',
	'"White, Betty"',
	'White, Betty'
);

for ($index = 0; $index -lt $samples.Count; $index++)
{

	$sample = $samples[$index];

	# show the sentence with its 1-based position
	Write-Host ($headerFormat -f ($index + 1), $sample);

	# split, then clean the tokens in place
	$tokens = tokenizeString $sample;
	tokenStripControlChar $tokens;

	# show the formatted token lines
	$displayLines = tokenPrepareDisplay $tokens;
	Write-Host $displayLines;

	# blank separator line between samples
	Write-Host "";

}

Code Sharing

GitHub

Gist

  1. DanielAdeniji/stringSplit.ps1
    Link

 

 

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s