DataFrame:通过SparkSql将scala类转为DataFrame的方法


Posted in Python on January 29, 2019

如下所示:

import java.text.DecimalFormat
import com.alibaba.fastjson.JSON
import com.donews.data.AppConfig
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Row, SaveMode, DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
 
/**
 * Created by silentwolf on 2016/6/3.
 */
 
// Flat record made of one String identifier followed by 27 Float values.
// UserTagTable.main builds instances positionally, converts them with toDF()
// and then selects the fields by these exact upper-case names, so neither the
// names nor the order of the parameters may change.
case class UserTag(
  // identifier
  SUUID: String,
  // gender
  MAN: Float, WOMAN: Float,
  // age brackets
  AGE10_19: Float, AGE20_29: Float, AGE30_39: Float, AGE40_49: Float, AGE50_59: Float,
  // entertainment / news
  GAME: Float, MOVIE: Float, MUSIC: Float, ART: Float, POLITICS_NEWS: Float,
  // finance / education / health / travel / cars
  FINANCIAL: Float, EDUCATION_TRAINING: Float, HEALTH_CARE: Float, TRAVEL: Float, AUTOMOBILE: Float,
  // property / fashion / beauty / IT / baby
  HOUSE_PROPERTY: Float, CLOTHING_ACCESSORIES: Float, BEAUTY: Float, IT: Float, BABY_PRODUCT: Float,
  // food / home / sports / outdoor / medicine
  FOOD_SERVICE: Float, HOME_FURNISHING: Float, SPORTS: Float, OUTDOOR_ACTIVITIES: Float, MEDICINE: Float
)
 
/**
 * Spark job that attaches a set of tag values to every distinct `suuid` found
 * in the `appstatistic` parquet data and writes the result to the Phoenix
 * table `USER_TAGS`.
 *
 * The tag values come from [[userTagInfo]], which produces random numbers, so
 * the output is mock data rather than anything derived from the input rows.
 */
object UserTagTable {

  // Local imports so the locale-independent formatter below does not require
  // touching the file-level import list.
  import java.text.DecimalFormatSymbols
  import java.util.Locale

  // Bound to this object (previously it pointed at an unrelated class).
  val LOG: org.slf4j.Logger = LoggerFactory.getLogger(getClass)

  val REP_HOME: String = s"${AppConfig.HDFS_MASTER}/${AppConfig.HDFS_REP}"

  /**
   * Entry point.
   *
   * @param args `args(0)` is the appkey, `args(1)` is the last day to include.
   *             The first day is fixed to "2016-04-10". When fewer than two
   *             arguments are supplied a usage line is printed and nothing
   *             else happens.
   */
  def main(args: Array[String]): Unit = {
    // Both arguments are read below, so require both before doing any work.
    if (args.length < 2) {
      println("请输入: appkey , StartTime : 2016-04-10 ,StartEnd :2016-04-11")
    } else {
      val appkey = args(0)
      val lastdate = args(1)

      val conf: com.typesafe.config.Config = ConfigFactory.load()
      val sc = new SparkContext()
      try {
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        loadDataFrame(sqlContext, appkey, "2016-04-10", lastdate).registerTempTable("suuidTable")

        sqlContext.udf.register("taginfo", (a: String) => userTagInfo(a))
        sqlContext.udf.register("intToString", (b: Long) => intToString(b))

        // Key step: pair each suuid with the JSON produced by the taginfo UDF
        // and turn the pair into a UserTag. Null suuids are dropped here
        // because the typed Row pattern below does not match null.
        sqlContext
          .sql(" select distinct(suuid) AS suuid,taginfo(suuid) from suuidTable where suuid is not null group by suuid")
          .map { case Row(suuid: String, taginfo: String) => toUserTag(suuid, taginfo) }
          .toDF()
          .registerTempTable("resultTable")

        val resultDF = sqlContext.sql(s"select '$appkey' AS APPKEY, '$lastdate' AS DATE,SUUID ,MAN,WOMAN,AGE10_19,AGE20_29,AGE30_39 ," +
          "AGE40_49 ,AGE50_59,GAME,MOVIE,MUSIC,ART,POLITICS_NEWS,FINANCIAL,EDUCATION_TRAINING,HEALTH_CARE,TRAVEL,AUTOMOBILE," +
          "HOUSE_PROPERTY,CLOTHING_ACCESSORIES,BEAUTY,IT,BABY_PRODUCT ,FOOD_SERVICE ,HOME_FURNISHING ,SPORTS ,OUTDOOR_ACTIVITIES ," +
          "MEDICINE from resultTable WHERE SUUID IS NOT NULL")

        resultDF.write
          .mode(SaveMode.Overwrite)
          .options(Map("table" -> "USER_TAGS", "zkUrl" -> conf.getString("Hbase.url")))
          .format("org.apache.phoenix.spark")
          .save()
      } finally {
        // Always release the context, including when the job fails.
        sc.stop()
      }
    }
  }

  /**
   * Builds a [[UserTag]] from an id and the JSON text produced by
   * [[userTagInfo]]. The JSON keys read here must match the keys written there.
   */
  private def toUserTag(suuid: String, tagJson: String): UserTag = {
    val obj = JSON.parseObject(tagJson)
    def f(key: String): Float = obj.getFloat(key)
    UserTag(suuid,
      f("man"),
      f("woman"),
      f("age10_19"),
      f("age20_29"),
      f("age30_39"),
      f("age40_49"),
      f("age50_59"),
      f("game"),
      f("movie"),
      f("music"),
      f("art"),
      f("politics_news"),
      f("financial"),
      f("education_training"),
      f("health_care"),
      f("travel"),
      f("automobile"),
      f("house_property"),
      f("clothing_accessories"),
      f("beauty"),
      f("IT"),
      f("baby_Product"),
      f("food_service"),
      f("home_furnishing"),
      f("sports"),
      f("outdoor_activities"),
      f("medicine")
    )
  }

  /** Returns the decimal string form of `suuid`. */
  def intToString(suuid: Long): String = suuid.toString

  /**
   * Produces a JSON object string with 27 numeric entries, each a random
   * value rounded to two decimals.
   *
   * `man` and `woman` are derived from one random draw (`woman` is
   * `1 - man`); `age50_59` is 1 minus the other four age values, each of
   * which is drawn from `[0, 0.2)` before rounding; every remaining key is an
   * independent draw from `[0, 1)` before rounding.
   *
   * @param num1 not used by the computation; present so the function can be
   *             registered as a one-argument UDF
   * @return the JSON text, keys in a fixed order starting with `"man"`
   */
  def userTagInfo(num1: String): String = {
    // Explicit symbols keep '.' as the decimal separator regardless of the
    // JVM default locale; otherwise toFloat fails on values like "0,53".
    // A new formatter per call, as before, since DecimalFormat is not thread-safe.
    val de = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.US))
    def rounded(x: Double): Float = de.format(x).toFloat

    val man = rounded(math.random)
    val woman = rounded(1 - man)

    val age10_19 = rounded(math.random * 0.2)
    val age20_29 = rounded(math.random * 0.2)
    val age30_39 = rounded(math.random * 0.2)
    val age40_49 = rounded(math.random * 0.2)
    val age50_59 = rounded(1 - age10_19 - age20_29 - age30_39 - age40_49)

    val interestKeys = Seq(
      "game", "movie", "music", "art", "politics_news",
      "financial", "education_training", "health_care", "travel", "automobile",
      "house_property", "clothing_accessories", "beauty", "IT", "baby_Product",
      "food_service", "home_furnishing", "sports", "outdoor_activities", "medicine"
    )

    val entries: Seq[(String, Float)] =
      Seq(
        "man" -> man,
        "woman" -> woman,
        "age10_19" -> age10_19,
        "age20_29" -> age20_29,
        "age30_39" -> age30_39,
        "age40_49" -> age40_49,
        "age50_59" -> age50_59
      ) ++ interestKeys.map(key => key -> rounded(math.random * 1))

    entries
      .map { case (key, value) => "\"" + key + "\":" + value }
      .mkString("{", ",", "}")
  }

  /**
   * Reads the parquet data under `REP_HOME/appstatistic` and keeps rows whose
   * `timestamp` is non-null, whose `appkey` equals the given one and whose
   * `day` lies in `[startDay, endDay]`.
   *
   * The predicate is built from column expressions rather than an
   * interpolated SQL string, so quote characters in the arguments cannot
   * alter or break the filter.
   */
  def loadDataFrame(ctx: SQLContext, appkey: String, startDay: String, endDay: String): DataFrame = {
    val path = s"$REP_HOME/appstatistic"
    val df = ctx.read.parquet(path)
    df.filter(
      df("timestamp").isNotNull &&
        df("appkey") === appkey &&
        df("day") >= startDay &&
        df("day") <= endDay
    )
  }

}

以上这篇DataFrame:通过SparkSql将scala类转为DataFrame的方法就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python实现通过pil模块对图片格式进行转换的方法
Mar 24 Python
python集合类型用法分析
Apr 08 Python
Python学习笔记整理3之输入输出、python eval函数
Dec 14 Python
Python视频爬虫实现下载头条视频功能示例
May 07 Python
详解Python3的TFTP文件传输
Jun 26 Python
Python嵌套列表转一维的方法(压平嵌套列表)
Jul 03 Python
python实现动态创建类的方法分析
Jun 25 Python
解决yum对python依赖版本问题
Jul 05 Python
Python如何将装饰器定义为类
Jul 30 Python
Python如何定义有可选参数的元类
Jul 31 Python
python re.match()用法相关示例
Jan 27 Python
Python机器学习应用之基于线性判别模型的分类篇详解
Jan 18 Python
pandas去除重复列的实现方法
Jan 29 #Python
使用Python向C语言的链接库传递数组、结构体、指针类型的数据
Jan 29 #Python
pandas去重复行并分类汇总的实现方法
Jan 29 #Python
spark dataframe 将一列展开,把该列所有值都变成新列的方法
Jan 29 #Python
Python使用ctypes调用C/C++的方法
Jan 29 #Python
dataframe 按条件替换某一列中的值方法
Jan 29 #Python
Numpy之random函数使用学习
Jan 29 #Python
You might like
WordPress中"无法将上传的文件移动至"错误的解决方法
2015/07/01 PHP
Joomla使用Apache重写模式的方法
2016/05/04 PHP
PHP Cli 模式设置进程名称的方法
2019/06/12 PHP
[原创]图片分页查看
2006/08/28 Javascript
JavaScript学习历程和心得小结
2010/08/16 Javascript
js multiple全选与取消全选实现代码
2012/12/04 Javascript
javascript计时器事件使用详解
2014/01/07 Javascript
跟我学习JScript的Bug与内存管理
2015/11/18 Javascript
js对象浅拷贝和深拷贝详解
2016/09/05 Javascript
jquery radio的取值_radio的选中_radio的重置方法
2016/09/20 Javascript
Bootstrap Table从服务器加载数据进行显示的实现方法
2016/09/29 Javascript
js从输入框读取内容,比较两个数字的大小方法
2017/03/13 Javascript
Flask中获取小程序Request数据的两种方法
2017/05/12 Javascript
基于javascript 显式转换与隐式转换(详解)
2017/12/15 Javascript
从setTimeout看js函数执行过程
2017/12/19 Javascript
JS内部事件机制之单线程原理
2018/07/02 Javascript
使用webpack搭建vue项目及注意事项
2019/06/10 Javascript
vue keep-alive列表页缓存 详情页返回上一页不刷新,定位到之前位置
2019/11/26 Javascript
Map与WeakMap类型在JavaScript中的使用详解
2020/11/18 Javascript
Python使用py2exe打包程序介绍
2014/11/20 Python
Python re模块介绍
2014/11/30 Python
Python基于pygame实现图片代替鼠标移动效果
2015/11/11 Python
在Qt中正确的设置窗体的背景图片的几种方法总结
2019/06/19 Python
Django组件content-type使用方法详解
2019/07/19 Python
python实现图片上添加图片
2019/11/26 Python
Python selenium 自动化脚本打包成一个exe文件(推荐)
2020/01/14 Python
python统计函数库scipy.stats的用法解析
2020/02/25 Python
Python基于pip实现离线打包过程详解
2020/05/15 Python
python不同系统中打开方法
2020/06/23 Python
越南母婴用品购物网站:Kids Plaza
2020/04/09 全球购物
工商管理应届生求职信
2013/10/07 职场文书
党委书记个人对照检查材料
2014/09/15 职场文书
2014年宣传部工作总结
2014/11/12 职场文书
精神文明建设先进个人事迹材料
2014/12/24 职场文书
带你彻底理解JavaScript中的原型对象
2021/04/14 Javascript
Golang jwt身份认证
2022/04/20 Golang