Here is my small contribution: a method that replaces the invalid characters in a single pass over a char array, without creating intermediate strings or StringBuilders. It's fast, easy to understand, and a good alternative to the other approaches mentioned in this post.
// Backing field for InvalidCharsHash; stays null until the property is first read.
private static HashSet<char> _invalidCharsHash;

/// <summary>
/// Gets a cached set built from <c>Path.GetInvalidFileNameChars()</c>,
/// creating it on first access so later lookups reuse the same instance.
/// </summary>
private static HashSet<char> InvalidCharsHash
{
    get
    {
        if (_invalidCharsHash == null)
        {
            _invalidCharsHash = new HashSet<char>(Path.GetInvalidFileNameChars());
        }

        return _invalidCharsHash;
    }
}
/// <summary>
/// Returns a copy of <paramref name="fileName"/> in which every character found in
/// <c>InvalidCharsHash</c> is overwritten with the first character of <paramref name="newValue"/>.
/// Characters are replaced one-for-one, so the result has the same length as the input.
/// </summary>
/// <param name="fileName">The file name to sanitize.</param>
/// <param name="newValue">The replacement; only its first character is used. Defaults to "_".</param>
/// <returns>The sanitized file name.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="newValue"/> is null or empty.</exception>
private static string ReplaceInvalidChars(string fileName, string newValue = "_")
{
    if (fileName == null)
    {
        throw new ArgumentNullException(nameof(fileName));
    }

    // newValue[0] below would otherwise fail with an opaque
    // NullReferenceException / IndexOutOfRangeException.
    if (string.IsNullOrEmpty(newValue))
    {
        throw new ArgumentException("A replacement character is required.", nameof(newValue));
    }

    char newChar = newValue[0];

    // Read the cached set once instead of going through the property on every iteration.
    HashSet<char> invalid = InvalidCharsHash;

    char[] chars = fileName.ToCharArray();
    for (int i = 0; i < chars.Length; i++)
    {
        if (invalid.Contains(chars[i]))
        {
            chars[i] = newChar;
        }
    }

    return new string(chars);
}
You can call it like this:
// Replace every invalid file-name character in the sample with an underscore.
string illegal = "\"M <>\"\\a/ry/ h**ad:>> a\\/:*?\"<>| li*tt|le|| la\"mb.?";
string legal = ReplaceInvalidChars(illegal, "_");
and returns:
_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
It's worth noting that this method always replaces invalid chars with a given value and never removes them. If you want to remove invalid chars (by passing a null or empty replacement), this alternative will do the trick:
/// <summary>
/// Returns a copy of <paramref name="fileName"/> with every character found in
/// <c>InvalidCharsHash</c> either removed or replaced.
/// </summary>
/// <param name="fileName">The file name to sanitize.</param>
/// <param name="newValue">
/// The replacement; only its first character is used. When it is null, empty, or starts
/// with <c>char.MinValue</c>, invalid characters are dropped instead of replaced.
/// </param>
/// <returns>The sanitized file name; never longer than the input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
private static string RemoveInvalidChars(string fileName, string newValue)
{
    if (fileName == null)
    {
        throw new ArgumentNullException(nameof(fileName));
    }

    char newChar = string.IsNullOrEmpty(newValue) ? char.MinValue : newValue[0];
    bool remove = newChar == char.MinValue;

    // Read the cached set once instead of going through the property on every iteration.
    HashSet<char> invalid = InvalidCharsHash;

    // Index the string directly; copying it into a second char[] first is unnecessary.
    // The output can never be longer than the input, so one buffer of that size suffices.
    char[] newChars = new char[fileName.Length];
    int written = 0;
    for (int i = 0; i < fileName.Length; i++)
    {
        char c = fileName[i];
        if (!invalid.Contains(c))
        {
            newChars[written++] = c;
        }
        else if (!remove)
        {
            newChars[written++] = newChar;
        }
    }

    // Only the first 'written' slots hold output.
    return new string(newChars, 0, written);
}
BENCHMARK
If performance is what you are after, I ran timed tests on most of the methods found in this post. Some of these methods don't replace with a given char, since the OP was asking to clean the string. I added tests that replace with a given char, and others that replace with an empty value, in case your scenario only needs to remove the unwanted chars. The code used for this benchmark is at the end, so you can run your own tests.
Note: Methods Test1 and Test2 are both proposed in this post.
First Run
replacing with '_', 1000000 iterations
Results:
============Test1===============
Elapsed=00:00:01.6665595
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test2===============
Elapsed=00:00:01.7526835
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test3===============
Elapsed=00:00:05.2306227
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test4===============
Elapsed=00:00:14.8203696
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test5===============
Elapsed=00:00:01.8273760
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test6===============
Elapsed=00:00:05.4249985
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test7===============
Elapsed=00:00:07.5653833
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test8===============
Elapsed=00:12:23.1410106
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test9===============
Elapsed=00:00:02.1016708
Result=_M ____a_ry_ h__ad___ a_________ li_tt_le__ la_mb._
============Test10===============
Elapsed=00:00:05.0987225
Result=M ary had a little lamb.
============Test11===============
Elapsed=00:00:06.8004289
Result=M ary had a little lamb.
Second Run
removing invalid chars, 1000000 iterations
Note: Test1 will not remove, only replace.
Results:
============Test1===============
Elapsed=00:00:01.6945352
Result= M a ry h ad a li tt le la mb.
============Test2===============
Elapsed=00:00:01.4798049
Result=M ary had a little lamb.
============Test3===============
Elapsed=00:00:04.0415688
Result=M ary had a little lamb.
============Test4===============
Elapsed=00:00:14.3397960
Result=M ary had a little lamb.
============Test5===============
Elapsed=00:00:01.6782505
Result=M ary had a little lamb.
============Test6===============
Elapsed=00:00:04.9251707
Result=M ary had a little lamb.
============Test7===============
Elapsed=00:00:07.9562379
Result=M ary had a little lamb.
============Test8===============
Elapsed=00:12:16.2918943
Result=M ary had a little lamb.
============Test9===============
Elapsed=00:00:02.0770277
Result=M ary had a little lamb.
============Test10===============
Elapsed=00:00:05.2721232
Result=M ary had a little lamb.
============Test11===============
Elapsed=00:00:05.2802903
Result=M ary had a little lamb.
BENCHMARK RESULTS
Methods Test1, Test2 and Test5 are the fastest. Method Test8 is the slowest.
CODE
Here's the complete code of the benchmark:
// Backing field for InvalidCharsHash; null until the first read of the property.
private static HashSet<char> _invalidCharsHash;

/// <summary>
/// Gets the cached hash set of the characters returned by
/// <c>Path.GetInvalidFileNameChars()</c>; the set is built on first use.
/// </summary>
private static HashSet<char> InvalidCharsHash
{
    get
    {
        HashSet<char> cached = _invalidCharsHash;
        if (cached != null)
        {
            return cached;
        }

        cached = new HashSet<char>(Path.GetInvalidFileNameChars());
        _invalidCharsHash = cached;
        return cached;
    }
}
// Backing field for InvalidCharsValue; null until the first read of the property.
private static string _invalidCharsValue;

/// <summary>
/// Gets the characters returned by <c>Path.GetInvalidFileNameChars()</c> joined into a
/// single string; the string is built on first use and reused afterwards.
/// </summary>
private static string InvalidCharsValue
{
    get
    {
        if (_invalidCharsValue == null)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            _invalidCharsValue = new string(invalid);
        }

        return _invalidCharsValue;
    }
}
// Backing field for InvalidChars; null until the first read of the property.
private static char[] _invalidChars;

/// <summary>
/// Gets the array returned by <c>Path.GetInvalidFileNameChars()</c>, fetched on first use
/// and cached so every later read returns the same array instance.
/// </summary>
private static char[] InvalidChars
{
    get
    {
        if (_invalidChars == null)
        {
            _invalidChars = Path.GetInvalidFileNameChars();
        }

        return _invalidChars;
    }
}
/// <summary>
/// Benchmark entry point: times Test1 through Test11, in that order, against the same
/// sample string and replacement value, then waits for a key press.
/// </summary>
static void Main(string[] args)
{
    const int iterations = 1000000;
    string sample = "\"M <>\"\\a/ry/ h**ad:>> a\\/:*?\"<>| li*tt|le|| la\"mb.?";

    // An empty replacement exercises the "remove" scenario.
    string replacement = "";

    var candidates = new Func<string, string, string>[]
    {
        Test1,
        Test2,
        Test3,
        Test4,
        Test5,
        Test6,
        Test7,
        Test8,
        Test9,
        Test10,
        Test11,
    };

    foreach (Func<string, string, string> candidate in candidates)
    {
        TimeBenchmark(iterations, candidate, sample, replacement);
    }

    // Keep the console window open until the user presses a key.
    Console.Read();
}
/// <summary>
/// Invokes <paramref name="func"/> <paramref name="maxLoop"/> times with the given
/// arguments and writes the method name, the elapsed time and the last result to the console.
/// </summary>
/// <param name="maxLoop">Number of times to invoke the delegate.</param>
/// <param name="func">The sanitizing method under test.</param>
/// <param name="testString">The input passed as the first argument on every call.</param>
/// <param name="newValue">The replacement passed as the second argument on every call.</param>
/// <exception cref="ArgumentNullException"><paramref name="func"/> is null.</exception>
private static void TimeBenchmark(int maxLoop, Func<string, string, string> func, string testString, string newValue)
{
    // Fail fast: the report below needs func.Method, so a null delegate can never
    // produce a usable measurement.
    if (func == null)
    {
        throw new ArgumentNullException(nameof(func));
    }

    string result = string.Empty;

    Stopwatch sw = Stopwatch.StartNew();
    for (int i = 0; i < maxLoop; i++)
    {
        result = func(testString, newValue);
    }
    sw.Stop();

    Console.WriteLine($"============{func.Method.Name}===============");
    Console.WriteLine("Elapsed={0}", sw.Elapsed);
    Console.WriteLine("Result={0}", result);
    Console.WriteLine("");
}
/// <summary>
/// Benchmark candidate: overwrites, in a char-array copy of <paramref name="fileName"/>,
/// every character found in <c>InvalidCharsHash</c> with the first character of
/// <paramref name="newValue"/>. A null or empty <paramref name="newValue"/> yields
/// <c>char.MinValue</c> as the replacement, so characters are overwritten, never removed.
/// </summary>
private static string Test1(string fileName, string newValue)
{
    char replacement;
    if (string.IsNullOrEmpty(newValue))
    {
        replacement = char.MinValue;
    }
    else
    {
        replacement = newValue[0];
    }

    char[] buffer = fileName.ToCharArray();
    int index = 0;
    while (index < buffer.Length)
    {
        if (InvalidCharsHash.Contains(buffer[index]))
        {
            buffer[index] = replacement;
        }

        index++;
    }

    return new string(buffer);
}
/// <summary>
/// Benchmark candidate: copies <paramref name="fileName"/> into a new buffer, replacing each
/// character found in <c>InvalidCharsHash</c> with the first character of
/// <paramref name="newValue"/>, or dropping it when that replacement is
/// <c>char.MinValue</c> (which is also used for a null or empty <paramref name="newValue"/>).
/// </summary>
private static string Test2(string fileName, string newValue)
{
    char replacement = char.MinValue;
    if (!string.IsNullOrEmpty(newValue))
    {
        replacement = newValue[0];
    }

    bool dropInvalid = replacement == char.MinValue;

    char[] source = fileName.ToCharArray();
    char[] target = new char[source.Length];
    int written = 0;

    foreach (char current in source)
    {
        if (!InvalidCharsHash.Contains(current))
        {
            target[written++] = current;
            continue;
        }

        if (!dropInvalid)
        {
            target[written++] = replacement;
        }
    }

    // Only the first 'written' slots hold output.
    return new string(target, 0, written);
}
/// <summary>
/// Benchmark candidate: calls <c>string.Replace</c> once per character of
/// <c>InvalidCharsValue</c>, substituting <paramref name="newValue"/> each time.
/// </summary>
private static string Test3(string filename, string newValue)
{
    string invalid = InvalidCharsValue;
    string result = filename;

    for (int i = 0; i < invalid.Length; i++)
    {
        string target = invalid[i].ToString();
        result = result.Replace(target, newValue);
    }

    return result;
}
/// <summary>
/// Benchmark candidate: builds a new <c>Regex</c> character class from
/// <c>InvalidCharsValue</c> on every call and uses it to replace each match with
/// <paramref name="newValue"/>.
/// </summary>
private static string Test4(string filename, string newValue)
{
    string pattern = "[" + Regex.Escape(InvalidCharsValue) + "]";
    var matcher = new Regex(pattern);
    return matcher.Replace(filename, newValue);
}
/// <summary>
/// Benchmark candidate: splits <paramref name="filename"/> on every character in
/// <c>InvalidChars</c> and joins the pieces back together with <paramref name="newValue"/>.
/// </summary>
private static string Test5(string filename, string newValue)
{
    string[] pieces = filename.Split(InvalidChars);
    return string.Join(newValue, pieces);
}
/// <summary>
/// Benchmark candidate: folds over <c>InvalidChars</c> with LINQ <c>Aggregate</c>, applying
/// one <c>string.Replace</c> per invalid character, starting from <paramref name="fileName"/>.
/// </summary>
private static string Test6(string fileName, string newValue)
{
    Func<string, char, string> replaceOne =
        (partial, invalid) => partial.Replace(invalid.ToString(), newValue);

    return InvalidChars.Aggregate(fileName, replaceOne);
}
/// <summary>
/// Benchmark candidate: uses the static <c>Regex.Replace</c> overload with
/// <c>RegexOptions.Compiled</c> and a character class built from <c>InvalidCharsValue</c>.
/// </summary>
private static string Test7(string fileName, string newValue)
{
    string characterClass = "[" + Regex.Escape(InvalidCharsValue) + "]";
    return Regex.Replace(fileName, characterClass, newValue, RegexOptions.Compiled);
}
/// <summary>
/// Benchmark candidate: constructs a new compiled, single-line, culture-invariant
/// <c>Regex</c> from <c>InvalidCharsValue</c> on every call and replaces each match with
/// <paramref name="newValue"/>.
/// </summary>
private static string Test8(string fileName, string newValue)
{
    const RegexOptions options =
        RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.CultureInvariant;

    string pattern = $"[{Regex.Escape(InvalidCharsValue)}]";
    return new Regex(pattern, options).Replace(fileName, newValue);
}
/// <summary>
/// Benchmark candidate: rebuilds the name in a <c>StringBuilder</c>, appending
/// <paramref name="newValue"/> in place of every character found in <c>InvalidCharsHash</c>.
/// Returns <paramref name="newValue"/> when nothing was appended, and the original
/// <paramref name="fileName"/> instance when no character needed replacing.
/// </summary>
private static string Test9(string fileName, string newValue)
{
    var builder = new StringBuilder(fileName.Length);
    bool replacedAny = false;

    foreach (char current in fileName)
    {
        if (!InvalidCharsHash.Contains(current))
        {
            builder.Append(current);
            continue;
        }

        replacedAny = true;
        builder.Append(newValue);
    }

    if (builder.Length == 0)
    {
        return newValue;
    }

    if (!replacedAny)
    {
        return fileName;
    }

    return builder.ToString();
}
/// <summary>
/// Benchmark candidate: returns <paramref name="fileName"/> unchanged when it contains no
/// character from <c>InvalidChars</c>; otherwise returns a new string with those characters
/// filtered out via LINQ. <paramref name="newValue"/> is not used.
/// </summary>
private static string Test10(string fileName, string newValue)
{
    bool hasInvalid = fileName.Any(c => InvalidChars.Contains(c));
    if (hasInvalid)
    {
        char[] kept = fileName.Where(c => !InvalidChars.Contains(c)).ToArray();
        return new string(kept);
    }

    return fileName;
}
/// <summary>
/// Benchmark candidate: always builds a new string containing only the characters of
/// <paramref name="fileName"/> that are not in <c>InvalidChars</c>.
/// <paramref name="newValue"/> is not used.
/// </summary>
private static string Test11(string fileName, string newValue)
{
    var validOnly = fileName.Where(x => !InvalidChars.Contains(x));
    char[] kept = validOnly.ToArray();
    return new string(kept);
}
GetInvalidFileNameChars() will strip things like : \ etc. from folder paths. – CAD bloke
Path.GetInvalidPathChars() doesn't seem to strip * or ? – CAD bloke