经验首页 前端设计 程序设计 Java相关 移动开发 数据库/运维 软件/图像 大数据/云计算 其他经验
当前位置:技术经验 » 大数据/云/AI » 人工智能基础 » 查看文章
ML.NET教程之客户细分(聚类问题)
来源:cnblogs  作者:Ken.W  时间:2019/1/2 9:27:41  对本文有异议

理解问题

客户细分需要解决的问题是按照客户之间的相似特征区分不同客户群体。这个问题的先决条件中没有可供使用的客户分类列表,只有客户的人物画像。

数据集

已有的数据是公司的历史商业活动记录以及客户的购买记录。
offers.csv:

  1. Offer #,Campaign,Varietal,Minimum Qty (kg),Discount (%),Origin,Past Peak
  2. 1,January,Malbec,72,56,France,FALSE
  3. 2,January,Pinot Noir,72,17,France,FALSE
  4. 3,February,Espumante,144,32,Oregon,TRUE
  5. 4,February,Champagne,72,48,France,TRUE
  6. 5,February,Cabernet Sauvignon,144,44,New Zealand,TRUE
  7. 6,March,Prosecco,144,86,Chile,FALSE
  8. 7,March,Prosecco,6,40,Australia,TRUE
  9. 8,March,Espumante,6,45,South Africa,FALSE
  10. 9,April,Chardonnay,144,57,Chile,FALSE
  11. 10,April,Prosecco,72,52,California,FALSE
  12. 11,May,Champagne,72,85,France,FALSE
  13. 12,May,Prosecco,72,83,Australia,FALSE
  14. 13,May,Merlot,6,43,Chile,FALSE
  15. 14,June,Merlot,72,64,Chile,FALSE
  16. 15,June,Cabernet Sauvignon,144,19,Italy,FALSE
  17. 16,June,Merlot,72,88,California,FALSE
  18. 17,July,Pinot Noir,12,47,Germany,FALSE
  19. 18,July,Espumante,6,50,Oregon,FALSE
  20. 19,July,Champagne,12,66,Germany,FALSE
  21. 20,August,Cabernet Sauvignon,72,82,Italy,FALSE
  22. 21,August,Champagne,12,50,California,FALSE
  23. 22,August,Champagne,72,63,France,FALSE
  24. 23,September,Chardonnay,144,39,South Africa,FALSE
  25. 24,September,Pinot Noir,6,34,Italy,FALSE
  26. 25,October,Cabernet Sauvignon,72,59,Oregon,TRUE
  27. 26,October,Pinot Noir,144,83,Australia,FALSE
  28. 27,October,Champagne,72,88,New Zealand,FALSE
  29. 28,November,Cabernet Sauvignon,12,56,France,TRUE
  30. 29,November,Pinot Grigio,6,87,France,FALSE
  31. 30,December,Malbec,6,54,France,FALSE
  32. 31,December,Champagne,72,89,France,FALSE
  33. 32,December,Cabernet Sauvignon,72,45,Germany,TRUE

transactions.csv:

  1. Customer Last Name,Offer #
  2. Smith,2
  3. Smith,24
  4. Johnson,17
  5. Johnson,24
  6. Johnson,26
  7. Williams,18
  8. Williams,22
  9. Williams,31
  10. Brown,7
  11. Brown,29
  12. Brown,30
  13. Jones,8
  14. Miller,6
  15. Miller,10
  16. Miller,14
  17. Miller,15
  18. Miller,22
  19. Miller,23
  20. Miller,31
  21. Davis,12
  22. Davis,22
  23. Davis,25
  24. Garcia,14
  25. Garcia,15
  26. Rodriguez,2
  27. Rodriguez,26
  28. Wilson,8
  29. Wilson,30
  30. Martinez,12
  31. Martinez,25
  32. Martinez,28
  33. Anderson,24
  34. Anderson,26
  35. Taylor,7
  36. Taylor,18
  37. Taylor,29
  38. Taylor,30
  39. Thomas,1
  40. Thomas,4
  41. Thomas,9
  42. Thomas,11
  43. Thomas,14
  44. Thomas,26
  45. Hernandez,28
  46. Hernandez,29
  47. Moore,17
  48. Moore,24
  49. Martin,2
  50. Martin,11
  51. Martin,28
  52. Jackson,1
  53. Jackson,2
  54. Jackson,11
  55. Jackson,15
  56. Jackson,22
  57. Thompson,9
  58. Thompson,16
  59. Thompson,25
  60. Thompson,30
  61. White,14
  62. White,22
  63. White,25
  64. White,30
  65. Lopez,9
  66. Lopez,11
  67. Lopez,15
  68. Lopez,16
  69. Lopez,27
  70. Lee,3
  71. Lee,4
  72. Lee,6
  73. Lee,22
  74. Lee,27
  75. Gonzalez,9
  76. Gonzalez,31
  77. Harris,4
  78. Harris,6
  79. Harris,7
  80. Harris,19
  81. Harris,22
  82. Harris,27
  83. Clark,4
  84. Clark,11
  85. Clark,28
  86. Clark,31
  87. Lewis,7
  88. Lewis,8
  89. Lewis,30
  90. Robinson,7
  91. Robinson,29
  92. Walker,18
  93. Walker,29
  94. Perez,18
  95. Perez,30
  96. Hall,11
  97. Hall,22
  98. Young,6
  99. Young,9
  100. Young,15
  101. Young,22
  102. Young,31
  103. Young,32
  104. Allen,9
  105. Allen,27
  106. Sanchez,4
  107. Sanchez,5
  108. Sanchez,14
  109. Sanchez,15
  110. Sanchez,20
  111. Sanchez,22
  112. Sanchez,26
  113. Wright,4
  114. Wright,6
  115. Wright,21
  116. Wright,27
  117. King,7
  118. King,13
  119. King,18
  120. King,29
  121. Scott,6
  122. Scott,14
  123. Scott,23
  124. Green,7
  125. Baker,7
  126. Baker,10
  127. Baker,19
  128. Baker,31
  129. Adams,18
  130. Adams,29
  131. Adams,30
  132. Nelson,3
  133. Nelson,4
  134. Nelson,8
  135. Nelson,31
  136. Hill,8
  137. Hill,13
  138. Hill,18
  139. Hill,30
  140. Ramirez,9
  141. Campbell,2
  142. Campbell,24
  143. Campbell,26
  144. Mitchell,1
  145. Mitchell,2
  146. Roberts,31
  147. Carter,7
  148. Carter,13
  149. Carter,29
  150. Carter,30
  151. Phillips,17
  152. Phillips,24
  153. Evans,22
  154. Evans,27
  155. Turner,4
  156. Turner,6
  157. Turner,27
  158. Turner,31
  159. Torres,8
  160. Parker,11
  161. Parker,16
  162. Parker,20
  163. Parker,29
  164. Parker,31
  165. Collins,11
  166. Collins,30
  167. Edwards,8
  168. Edwards,27
  169. Stewart,8
  170. Stewart,29
  171. Stewart,30
  172. Flores,17
  173. Flores,24
  174. Morris,17
  175. Morris,24
  176. Morris,26
  177. Nguyen,19
  178. Nguyen,31
  179. Murphy,7
  180. Murphy,12
  181. Rivera,7
  182. Rivera,18
  183. Cook,24
  184. Cook,26
  185. Rogers,3
  186. Rogers,7
  187. Rogers,8
  188. Rogers,19
  189. Rogers,21
  190. Rogers,22
  191. Morgan,8
  192. Morgan,29
  193. Peterson,1
  194. Peterson,2
  195. Peterson,10
  196. Peterson,23
  197. Peterson,26
  198. Peterson,27
  199. Cooper,4
  200. Cooper,16
  201. Cooper,20
  202. Cooper,32
  203. Reed,5
  204. Reed,14
  205. Bailey,7
  206. Bailey,30
  207. Bell,2
  208. Bell,17
  209. Bell,24
  210. Bell,26
  211. Gomez,11
  212. Gomez,20
  213. Gomez,25
  214. Gomez,32
  215. Kelly,6
  216. Kelly,20
  217. Kelly,31
  218. Kelly,32
  219. Howard,11
  220. Howard,12
  221. Howard,22
  222. Ward,4
  223. Cox,2
  224. Cox,17
  225. Cox,24
  226. Cox,26
  227. Diaz,7
  228. Diaz,8
  229. Diaz,29
  230. Diaz,30
  231. Richardson,3
  232. Richardson,6
  233. Richardson,22
  234. Wood,1
  235. Wood,10
  236. Wood,14
  237. Wood,31
  238. Watson,7
  239. Watson,29
  240. Brooks,3
  241. Brooks,8
  242. Brooks,11
  243. Brooks,22
  244. Bennett,8
  245. Bennett,29
  246. Gray,12
  247. Gray,16
  248. Gray,26
  249. James,7
  250. James,8
  251. James,13
  252. James,18
  253. James,30
  254. Reyes,9
  255. Reyes,23
  256. Cruz,29
  257. Cruz,30
  258. Hughes,7
  259. Hughes,8
  260. Hughes,13
  261. Hughes,29
  262. Hughes,30
  263. Price,1
  264. Price,22
  265. Price,30
  266. Price,31
  267. Myers,18
  268. Myers,30
  269. Long,3
  270. Long,7
  271. Long,10
  272. Foster,1
  273. Foster,9
  274. Foster,14
  275. Foster,22
  276. Foster,23
  277. Sanders,1
  278. Sanders,4
  279. Sanders,5
  280. Sanders,6
  281. Sanders,9
  282. Sanders,11
  283. Sanders,20
  284. Sanders,25
  285. Sanders,26
  286. Ross,18
  287. Ross,21
  288. Morales,6
  289. Morales,7
  290. Morales,8
  291. Morales,19
  292. Morales,22
  293. Morales,31
  294. Powell,5
  295. Sullivan,8
  296. Sullivan,13
  297. Sullivan,18
  298. Russell,26
  299. Ortiz,8
  300. Jenkins,24
  301. Jenkins,26
  302. Gutierrez,6
  303. Gutierrez,8
  304. Gutierrez,10
  305. Gutierrez,18
  306. Perry,8
  307. Perry,18
  308. Perry,29
  309. Perry,30
  310. Butler,1
  311. Butler,4
  312. Butler,22
  313. Butler,28
  314. Butler,30
  315. Barnes,10
  316. Barnes,21
  317. Barnes,22
  318. Barnes,31
  319. Fisher,1
  320. Fisher,2
  321. Fisher,11
  322. Fisher,22
  323. Fisher,28
  324. Fisher,30
  325. Fisher,31

预处理

需要对两个数据集做关联处理,这样才能得到单一的视图。同时由于需要比较客户所产生的交易,还需要建立一张透视表。行代表客户,列代表商业活动,单元格值则表示客户是否有购买行为。

  1. var offers = Offer.ReadFromCsv(_offersCsv);
  2. var transactions = Transaction.ReadFromCsv(_transactionsCsv);
  3. var clusterData = (from of in offers
  4. join tr in transactions on of.OfferId equals tr.OfferId
  5. select new
  6. {
  7. of.OfferId,
  8. of.Campaign,
  9. of.Discount,
  10. tr.LastName,
  11. of.LastPeak,
  12. of.Minimum,
  13. of.Origin,
  14. of.Varietal,
  15. Count = 1,
  16. }).ToArray();
  17. var count = offers.Count();
  18. var pivotDataArray =
  19. (from c in clusterData
  20. group c by c.LastName into gcs
  21. let lookup = gcs.ToLookup(y => y.OfferId, y => y.Count)
  22. select new PivotData()
  23. {
  24. LastName = gcs.Key,
  25. Features = ToFeatures(lookup, count)
  26. }).ToArray();

ToFeatures方法依据商业活动的数量,生成所需的特征数组。

  1. private static float[] ToFeatures(ILookup<string, int> lookup, int count)
  2. {
  3. var result = new float[count];
  4. foreach (var item in lookup)
  5. {
  6. var key = Convert.ToInt32(item.Key) - 1;
  7. result[key] = item.Sum();
  8. }
  9. return result;
  10. }

数据视图

取得用于生成视图的数组后,这里使用CreateStreamingDataView方法构建数据视图。而又因为Features属性是一个数组,所以必须声明其大小。

  1. var mlContext = new MLContext();
  2. var schemaDef = SchemaDefinition.Create(typeof(PivotData));
  3. schemaDef["Features"].ColumnType = new VectorType(NumberType.R4, count);
  4. var pivotDataView = mlContext.CreateStreamingDataView(pivotDataArray, schemaDef);

PCA

PCA(Principal Component Analysis,主成分分析)的作用是将过多的维度减少至一个合适的范围以便于分析,这里是降到二维空间。

  1. new PrincipalComponentAnalysisEstimator(mlContext, "Features", "PCAFeatures", rank: 2)

OneHotEncoding

One Hot Encoding在此处的作用是将LastName从字符串转换为数字矩阵。

  1. new OneHotEncodingEstimator(mlContext, new[] { new OneHotEncodingEstimator.ColumnInfo("LastName", "LastNameKey", OneHotEncodingTransformer.OutputKind.Ind) })

训练器

K-Means是常用的应对聚类问题的训练器,这里假设要分为三类。

  1. mlContext.Clustering.Trainers.KMeans("Features", clustersCount: 3)

训练模型

  1. trainingPipeline.Fit(pivotDataView);

评估模型

  1. var predictions = trainedModel.Transform(pivotDataView);
  2. var metrics = mlContext.Clustering.Evaluate(predictions, score: "Score", features: "Features");
  3. Console.WriteLine($"*************************************************");
  4. Console.WriteLine($"* Metrics for {trainer} clustering model ");
  5. Console.WriteLine($"*------------------------------------------------");
  6. Console.WriteLine($"* AvgMinScore: {metrics.AvgMinScore}");
  7. Console.WriteLine($"* DBI is: {metrics.Dbi}");
  8. Console.WriteLine($"*************************************************");

可得到如下的评估结果。

  1. *************************************************
  2. * Metrics for Microsoft.ML.Trainers.KMeans.KMeansPlusPlusTrainer clustering model
  3. *------------------------------------------------
  4. * AvgMinScore: 2.3154067927599
  5. * DBI is: 2.69100740819456
  6. *************************************************

使用模型

  1. var clusteringPredictions = predictions
  2. .AsEnumerable<ClusteringPrediction>(mlContext, false)
  3. .ToArray();

画图

为了更直观地观察,可以用OxyPlot类库生成结果图片。

添加类库:

  1. dotnet add package OxyPlot.Core

Plot生成处理:

  1. var plot = new PlotModel { Title = "Customer Segmentation", IsLegendVisible = true };
  2. var clusters = clusteringPredictions.Select(p => p.SelectedClusterId).Distinct().OrderBy(x => x);
  3. foreach (var cluster in clusters)
  4. {
  5. var scatter = new ScatterSeries { MarkerType = MarkerType.Circle, MarkerStrokeThickness = 2, Title = $"Cluster: {cluster}", RenderInLegend = true };
  6. var series = clusteringPredictions
  7. .Where(p => p.SelectedClusterId == cluster)
  8. .Select(p => new ScatterPoint(p.Location[0], p.Location[1])).ToArray();
  9. scatter.Points.AddRange(series);
  10. plot.Series.Add(scatter);
  11. }
  12. plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;
  13. var exporter = new SvgExporter { Width = 600, Height = 400 };
  14. using (var fs = new System.IO.FileStream(_plotSvg, System.IO.FileMode.Create))
  15. {
  16. exporter.Export(plot, fs);
  17. }

最后的图片如下所示:

完整示例代码

Program类:

  1. using CustomerSegmentation.DataStructures;
  2. using Microsoft.ML;
  3. using System;
  4. using System.IO;
  5. using System.Linq;
  6. using Microsoft.ML.Runtime.Api;
  7. using Microsoft.ML.Transforms.Projections;
  8. using Microsoft.ML.Transforms.Categorical;
  9. using Microsoft.ML.Runtime.Data;
  10. using OxyPlot;
  11. using OxyPlot.Series;
  12. using Microsoft.ML.Core.Data;
  13. namespace CustomerSegmentation
  14. {
  15. class Program
  16. {
  17. private static float[] ToFeatures(ILookup<string, int> lookup, int count)
  18. {
  19. var result = new float[count];
  20. foreach (var item in lookup)
  21. {
  22. var key = Convert.ToInt32(item.Key) - 1;
  23. result[key] = item.Sum();
  24. }
  25. return result;
  26. }
  27. static readonly string _offersCsv = Path.Combine(Environment.CurrentDirectory, "assets", "offers.csv");
  28. static readonly string _transactionsCsv = Path.Combine(Environment.CurrentDirectory, "assets", "transactions.csv");
  29. static readonly string _plotSvg = Path.Combine(Environment.CurrentDirectory, "assets", "customerSegmentation.svg");
  30. static void Main(string[] args)
  31. {
  32. var offers = Offer.ReadFromCsv(_offersCsv);
  33. var transactions = Transaction.ReadFromCsv(_transactionsCsv);
  34. var clusterData = (from of in offers
  35. join tr in transactions on of.OfferId equals tr.OfferId
  36. select new
  37. {
  38. of.OfferId,
  39. of.Campaign,
  40. of.Discount,
  41. tr.LastName,
  42. of.LastPeak,
  43. of.Minimum,
  44. of.Origin,
  45. of.Varietal,
  46. Count = 1,
  47. }).ToArray();
  48. var count = offers.Count();
  49. var pivotDataArray =
  50. (from c in clusterData
  51. group c by c.LastName into gcs
  52. let lookup = gcs.ToLookup(y => y.OfferId, y => y.Count)
  53. select new PivotData()
  54. {
  55. LastName = gcs.Key,
  56. Features = ToFeatures(lookup, count)
  57. }).ToArray();
  58. var mlContext = new MLContext();
  59. var schemaDef = SchemaDefinition.Create(typeof(PivotData));
  60. schemaDef["Features"].ColumnType = new VectorType(NumberType.R4, count);
  61. var pivotDataView = mlContext.CreateStreamingDataView(pivotDataArray, schemaDef);
  62. var dataProcessPipeline = new PrincipalComponentAnalysisEstimator(mlContext, "Features", "PCAFeatures", rank: 2)
  63. .Append(new OneHotEncodingEstimator(mlContext,
  64. new[] { new OneHotEncodingEstimator.ColumnInfo("LastName", "LastNameKey", OneHotEncodingTransformer.OutputKind.Ind) }));
  65. var trainer = mlContext.Clustering.Trainers.KMeans("Features", clustersCount: 3);
  66. var trainingPipeline = dataProcessPipeline.Append(trainer);
  67. ITransformer trainedModel = trainingPipeline.Fit(pivotDataView);
  68. var predictions = trainedModel.Transform(pivotDataView);
  69. var metrics = mlContext.Clustering.Evaluate(predictions, score: "Score", features: "Features");
  70. Console.WriteLine($"*************************************************");
  71. Console.WriteLine($"* Metrics for {trainer} clustering model ");
  72. Console.WriteLine($"*------------------------------------------------");
  73. Console.WriteLine($"* AvgMinScore: {metrics.AvgMinScore}");
  74. Console.WriteLine($"* DBI is: {metrics.Dbi}");
  75. Console.WriteLine($"*************************************************");
  76. var clusteringPredictions = predictions
  77. .AsEnumerable<ClusteringPrediction>(mlContext, false)
  78. .ToArray();
  79. var plot = new PlotModel { Title = "Customer Segmentation", IsLegendVisible = true };
  80. var clusters = clusteringPredictions.Select(p => p.SelectedClusterId).Distinct().OrderBy(x => x);
  81. foreach (var cluster in clusters)
  82. {
  83. var scatter = new ScatterSeries { MarkerType = MarkerType.Circle, MarkerStrokeThickness = 2, Title = $"Cluster: {cluster}", RenderInLegend = true };
  84. var series = clusteringPredictions
  85. .Where(p => p.SelectedClusterId == cluster)
  86. .Select(p => new ScatterPoint(p.Location[0], p.Location[1])).ToArray();
  87. scatter.Points.AddRange(series);
  88. plot.Series.Add(scatter);
  89. }
  90. plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;
  91. var exporter = new SvgExporter { Width = 600, Height = 400 };
  92. using (var fs = new System.IO.FileStream(_plotSvg, System.IO.FileMode.Create))
  93. {
  94. exporter.Export(plot, fs);
  95. }
  96. Console.Read();
  97. }
  98. }
  99. }

Offer类:

  1. using System.Collections.Generic;
  2. using System.IO;
  3. using System.Linq;
  4. namespace CustomerSegmentation.DataStructures
  5. {
  6. public class Offer
  7. {
  8. //Offer #,Campaign,Varietal,Minimum Qty (kg),Discount (%),Origin,Past Peak
  9. public string OfferId { get; set; }
  10. public string Campaign { get; set; }
  11. public string Varietal { get; set; }
  12. public float Minimum { get; set; }
  13. public float Discount { get; set; }
  14. public string Origin { get; set; }
  15. public string LastPeak { get; set; }
  16. public static IEnumerable<Offer> ReadFromCsv(string file)
  17. {
  18. return File.ReadAllLines(file)
  19. .Skip(1) // skip header
  20. .Select(x => x.Split(','))
  21. .Select(x => new Offer()
  22. {
  23. OfferId = x[0],
  24. Campaign = x[1],
  25. Varietal = x[2],
  26. Minimum = float.Parse(x[3]),
  27. Discount = float.Parse(x[4]),
  28. Origin = x[5],
  29. LastPeak = x[6]
  30. });
  31. }
  32. }
  33. }

Transaction类:

  1. using System.Collections.Generic;
  2. using System.IO;
  3. using System.Linq;
  4. namespace CustomerSegmentation.DataStructures
  5. {
  6. public class Transaction
  7. {
  8. //Customer Last Name,Offer #
  9. //Smith,2
  10. public string LastName { get; set; }
  11. public string OfferId { get; set; }
  12. public static IEnumerable<Transaction> ReadFromCsv(string file)
  13. {
  14. return File.ReadAllLines(file)
  15. .Skip(1) // skip header
  16. .Select(x => x.Split(','))
  17. .Select(x => new Transaction()
  18. {
  19. LastName = x[0],
  20. OfferId = x[1],
  21. });
  22. }
  23. }
  24. }

PivotData类:

  1. namespace CustomerSegmentation.DataStructures
  2. {
  3. public class PivotData
  4. {
  5. public float[] Features;
  6. public string LastName;
  7. }
  8. }

ClusteringPrediction类:

  1. using Microsoft.ML.Runtime.Api;
  2. using System;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. namespace CustomerSegmentation.DataStructures
  6. {
  7. public class ClusteringPrediction
  8. {
  9. [ColumnName("PredictedLabel")]
  10. public uint SelectedClusterId;
  11. [ColumnName("Score")]
  12. public float[] Distance;
  13. [ColumnName("PCAFeatures")]
  14. public float[] Location;
  15. [ColumnName("LastName")]
  16. public string LastName;
  17. }
  18. }
 友情链接:直通硅谷  点职佳  北美留学生论坛

本站QQ群:前端 618073944 | Java 606181507 | Python 626812652 | C/C++ 612253063 | 微信 634508462 | 苹果 692586424 | C#/.net 182808419 | PHP 305140648 | 运维 608723728

W3xue 的所有内容仅供测试,对任何法律问题及风险不承担任何责任。通过使用本站内容随之而来的风险与本站无关。
关于我们  |  意见建议  |  捐助我们  |  报错有奖  |  广告合作、友情链接(目前9元/月)请联系QQ:27243702 沸活量
皖ICP备17017327号-2 皖公网安备34020702000426号