DataFrame:通过SparkSql将scala类转为DataFrame的方法


Posted in Python on January 29, 2019

如下所示:

import java.text.DecimalFormat
import com.alibaba.fastjson.JSON
import com.donews.data.AppConfig
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Row, SaveMode, DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
 
/**
 * One record of tag scores for a single user, keyed by `SUUID`.
 *
 * Every score is a `Float`. The field names are upper-case on purpose: when a
 * collection of `UserTag` is turned into a DataFrame, the field names become
 * the column names that SQL statements select on.
 *
 * Created by silentwolf on 2016/6/3.
 */
case class UserTag(
  SUUID: String,
  // gender
  MAN: Float,
  WOMAN: Float,
  // age brackets
  AGE10_19: Float,
  AGE20_29: Float,
  AGE30_39: Float,
  AGE40_49: Float,
  AGE50_59: Float,
  // interest categories
  GAME: Float,
  MOVIE: Float,
  MUSIC: Float,
  ART: Float,
  POLITICS_NEWS: Float,
  FINANCIAL: Float,
  EDUCATION_TRAINING: Float,
  HEALTH_CARE: Float,
  TRAVEL: Float,
  AUTOMOBILE: Float,
  HOUSE_PROPERTY: Float,
  CLOTHING_ACCESSORIES: Float,
  BEAUTY: Float,
  IT: Float,
  BABY_PRODUCT: Float,
  FOOD_SERVICE: Float,
  HOME_FURNISHING: Float,
  SPORTS: Float,
  OUTDOOR_ACTIVITIES: Float,
  MEDICINE: Float
)
 
/**
 * Spark job that builds one [[UserTag]] row per distinct `suuid` found in the
 * `appstatistic` parquet data and saves the result to the Phoenix table
 * `USER_TAGS`.
 *
 * The tag scores themselves come from [[UserTagTable.userTagInfo]], which
 * produces random values; the point of the job is the conversion
 * "SQL rows -> Scala case class -> DataFrame".
 */
object UserTagTable {

  import java.text.DecimalFormatSymbols
  import java.util.Locale

  // Logger named after this object.
  val LOG: org.slf4j.Logger = LoggerFactory.getLogger(getClass)

  val REP_HOME = s"${AppConfig.HDFS_MASTER}/${AppConfig.HDFS_REP}"

  // Lower bound of the `day` filter; the job always starts from this day.
  private val DefaultStartDay = "2016-04-10"

  // JSON keys of the interest scores, in the order they are written to the JSON text.
  private val InterestKeys: Seq[String] = Seq(
    "game", "movie", "music", "art", "politics_news",
    "financial", "education_training", "health_care", "travel", "automobile",
    "house_property", "clothing_accessories", "beauty", "IT", "baby_Product",
    "food_service", "home_furnishing", "sports", "outdoor_activities", "medicine"
  )

  /**
   * Entry point.
   *
   * @param args `args(0)` is the appkey, `args(1)` is the last day (inclusive,
   *             e.g. `2016-04-11`). With fewer than two arguments only the
   *             usage message is printed and no Spark context is started.
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the usage text mentions a start time, but only the appkey and
    // the end day are read; the start day is fixed to DefaultStartDay.
    if (args.length < 2) {
      println("请输入: appkey , StartTime : 2016-04-10 ,StartEnd :2016-04-11")
    } else {
      val appkey = args(0)
      val lastdate = args(1)

      val conf: com.typesafe.config.Config = ConfigFactory.load()
      // Read the target URL up front so a missing key fails before any Spark work.
      val zkUrl = conf.getString("Hbase.url")

      val sc = new SparkContext()
      try {
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        loadDataFrame(sqlContext, appkey, DefaultStartDay, lastdate)
          .registerTempTable("suuidTable")

        sqlContext.udf.register("taginfo", (a: String) => userTagInfo(a))
        sqlContext.udf.register("intToString", (b: Long) => intToString(b))

        // Key step: take each suuid from the temp table together with the JSON
        // text returned by the UDF and turn the pair into a UserTag.
        sqlContext
          .sql(" select distinct(suuid) AS suuid,taginfo(suuid) from suuidTable group by suuid")
          .map { case Row(suuid: String, taginfo: String) => toUserTag(suuid, taginfo) }
          .toDF()
          .registerTempTable("resultTable")

        // NOTE(review): appkey and lastdate are spliced into the SQL text as-is;
        // a value containing a single quote breaks the statement.
        val resultDF = sqlContext.sql(s"select '$appkey' AS APPKEY, '$lastdate' AS DATE,SUUID ,MAN,WOMAN,AGE10_19,AGE20_29,AGE30_39 ," +
          "AGE40_49 ,AGE50_59,GAME,MOVIE,MUSIC,ART,POLITICS_NEWS,FINANCIAL,EDUCATION_TRAINING,HEALTH_CARE,TRAVEL,AUTOMOBILE," +
          "HOUSE_PROPERTY,CLOTHING_ACCESSORIES,BEAUTY,IT,BABY_PRODUCT ,FOOD_SERVICE ,HOME_FURNISHING ,SPORTS ,OUTDOOR_ACTIVITIES ," +
          "MEDICINE from resultTable WHERE SUUID IS NOT NULL")

        resultDF.write
          .mode(SaveMode.Overwrite)
          .options(Map("table" -> "USER_TAGS", "zkUrl" -> zkUrl))
          .format("org.apache.phoenix.spark")
          .save()
      } finally {
        // Release the Spark context whether the job succeeded or failed.
        sc.stop()
      }
    }
  }

  /** Parses the JSON text produced by [[userTagInfo]] and builds the matching [[UserTag]]. */
  private def toUserTag(suuid: String, tagJson: String): UserTag = {
    val obj = JSON.parseObject(tagJson)
    def score(key: String): Float = obj.getFloat(key)

    UserTag(suuid,
      score("man"),
      score("woman"),
      score("age10_19"),
      score("age20_29"),
      score("age30_39"),
      score("age40_49"),
      score("age50_59"),
      score("game"),
      score("movie"),
      score("music"),
      score("art"),
      score("politics_news"),
      score("financial"),
      score("education_training"),
      score("health_care"),
      score("travel"),
      score("automobile"),
      score("house_property"),
      score("clothing_accessories"),
      score("beauty"),
      score("IT"),
      score("baby_Product"),
      score("food_service"),
      score("home_furnishing"),
      score("sports"),
      score("outdoor_activities"),
      score("medicine")
    )
  }

  /** Returns the decimal string form of `suuid`. */
  def intToString(suuid: Long): String = suuid.toString

  /**
   * Produces a JSON object (as text) holding random tag scores rounded to two
   * decimals.
   *
   * `man` + `woman` and the five age brackets are each derived so that the
   * last member is `1 - (sum of the others)`; every interest score is an
   * independent random value in `[0, 1]`.
   *
   * @param num1 not used by the computation; present so the function can be
   *             registered as a one-argument UDF
   * @return JSON text such as `{"man":0.53,"woman":0.47,...,"medicine":0.12}`
   */
  def userTagInfo(num1: String): String = {
    // Fixed symbols: with the JVM default locale the decimal separator may be a
    // comma, and "0,53".toFloat throws NumberFormatException.
    val twoDecimals = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.ROOT))
    def rounded(value: Double): Float = twoDecimals.format(value).toFloat

    val man = rounded(math.random)
    val woman = rounded(1 - man)

    val age10_19 = rounded(math.random * 0.2)
    val age20_29 = rounded(math.random * 0.2)
    val age30_39 = rounded(math.random * 0.2)
    val age40_49 = rounded(math.random * 0.2)
    val age50_59 = rounded(1 - age10_19 - age20_29 - age30_39 - age40_49)

    val interests: Seq[(String, Float)] = InterestKeys.map(key => key -> rounded(math.random))

    val fields: Seq[(String, Float)] = Seq(
      "man" -> man,
      "woman" -> woman,
      "age10_19" -> age10_19,
      "age20_29" -> age20_29,
      "age30_39" -> age30_39,
      "age40_49" -> age40_49,
      "age50_59" -> age50_59
    ) ++ interests

    fields
      .map { case (key, value) => "\"" + key + "\":" + value }
      .mkString("{", ",", "}")
  }

  /**
   * Loads the `appstatistic` parquet data under [[REP_HOME]] and keeps the rows
   * of one app within a day range.
   *
   * @param ctx      SQL context used for reading
   * @param appkey   value the `appkey` column must equal
   * @param startDay inclusive lower bound for the `day` column
   * @param endDay   inclusive upper bound for the `day` column
   * @return rows with a non-null `timestamp` matching the app and day range
   */
  def loadDataFrame(ctx: SQLContext, appkey: String, startDay: String, endDay: String): DataFrame = {
    val path = s"$REP_HOME/appstatistic"
    // NOTE(review): the arguments are spliced into the filter expression as-is;
    // a value containing a single quote breaks the expression.
    ctx.read.parquet(path)
      .filter(s"timestamp is not null and appkey='$appkey' and day>='$startDay' and day<='$endDay'")
  }

}

以上这篇DataFrame:通过SparkSql将scala类转为DataFrame的方法就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python断言assert的用法代码解析
Feb 03 Python
Python Selenium Cookie 绕过验证码实现登录示例代码
Apr 10 Python
python使用__slots__让你的代码更加节省内存
Sep 05 Python
python实现多进程代码示例
Oct 31 Python
Python进程间通信Queue消息队列用法分析
May 22 Python
解决pycharm 工具栏Tool中找不到Run manager.py Task的问题
Jul 01 Python
详解Python Qt的窗体开发的基本操作
Jul 14 Python
Python测试模块doctest使用解析
Aug 10 Python
解决Jupyter NoteBook输出的图表太小看不清问题
Apr 16 Python
浅谈keras通过model.fit_generator训练模型(节省内存)
Jun 17 Python
python函数指定默认值的实例讲解
Mar 29 Python
Python数据清洗工具之Numpy的基本操作
Apr 22 Python
pandas去除重复列的实现方法
Jan 29 #Python
使用Python向C语言的链接库传递数组、结构体、指针类型的数据
Jan 29 #Python
pandas去重复行并分类汇总的实现方法
Jan 29 #Python
spark dataframe 将一列展开,把该列所有值都变成新列的方法
Jan 29 #Python
Python使用ctypes调用C/C++的方法
Jan 29 #Python
dataframe 按条件替换某一列中的值方法
Jan 29 #Python
Numpy之random函数使用学习
Jan 29 #Python
You might like
让这部DC动画新作刷新你的认知
2020/03/03 欧美动漫
数字转英文
2006/12/06 PHP
php 文章采集正则代码
2009/12/28 PHP
探讨如何在php168_cms中提取验证码
2013/06/08 PHP
ThinkPHP表单自动验证实例
2014/10/13 PHP
标准PHP的AES加密算法类
2015/03/12 PHP
PHP添加PNG图片背景透明水印操作类定义与用法示例
2019/03/12 PHP
PHP中str_split()函数的用法讲解
2019/04/11 PHP
PhpStorm 如何优雅的调试Hyperf的方法步骤
2019/11/24 PHP
通过JAVASCRIPT读取ASP设定的COOKIE
2006/11/24 Javascript
Javascript结合css实现网页换肤功能
2009/11/02 Javascript
基于node.js的快速开发透明代理
2010/12/25 Javascript
js substr支持中文截取函数代码(中文是双字节)
2013/04/17 Javascript
javascript 数组排序函数sort和reverse使用介绍
2013/11/21 Javascript
jquery实现简单的表单验证
2015/11/17 Javascript
浅谈JavaScript 标准对象
2016/06/02 Javascript
canvas实现动态小球重叠效果
2017/02/06 Javascript
nodejs搭建本地服务器并访问文件的方法
2017/03/03 NodeJs
JavaScript深拷贝和浅拷贝概念与用法实例分析
2018/06/07 Javascript
JS实现方形抽奖效果
2018/08/27 Javascript
关于IDEA中的.VUE文件报错 Export declarations are not supported by current JavaScript version
2020/10/17 Javascript
[15:15]教你分分钟做大人:狙击手
2014/10/30 DOTA
Django 前后台的数据传递的方法
2017/08/08 Python
Python2与python3中 for 循环语句基础与实例分析
2017/11/20 Python
python登录并爬取淘宝信息代码示例
2017/12/09 Python
Python(Django)项目与Apache的管理交互的方法
2018/05/16 Python
pytorch 调整某一维度数据顺序的方法
2018/12/08 Python
解决sublime+python3无法输出中文的问题
2018/12/12 Python
TensorFlow的reshape操作 tf.reshape的实现
2020/04/19 Python
CSS3 icon font完全指南(CSS3 font 会取代icon图标)
2013/01/06 HTML / CSS
借助HTML5 Canvas来绘制三角形和矩形等多边形的方法
2016/03/14 HTML / CSS
HTML5头部标签的一些常用信息小结
2016/10/23 HTML / CSS
美国紧身牛仔裤品牌:NYDJ
2017/05/24 全球购物
小女主人连衣裙:Little Mistress
2017/07/10 全球购物
马德里著名的运动鞋商店:NOIRFONCE
2019/04/12 全球购物
店长助理岗位职责
2013/12/13 职场文书