import tensorflow as tf
# Column names of the census CSV files, in file order.  input_fn zips these
# with the decoded columns and pops 'income_bracket' off to build the label.
_CSV_COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

# One default per entry of _CSV_COLUMNS, in the same order.  input_fn passes
# this list to tf.decode_csv as record_defaults.
_CSV_COLUMN_DEFAULTS = [
    [0],   # age
    [''],  # workclass
    [0],   # fnlwgt
    [''],  # education
    [0],   # education_num
    [''],  # marital_status
    [''],  # occupation
    [''],  # relationship
    [''],  # race
    [''],  # gender
    [0],   # capital_gain
    [0],   # capital_loss
    [0],   # hours_per_week
    [''],  # native_country
    [''],  # income_bracket
]

# input_fn sums these two values to size its shuffle buffer.
# Presumably the record counts of the data splits -- TODO confirm.
_NUM_EXAMPLES = dict(train=32561, validation=16281)
# 1. Read the Census Data
# 2. Converting Data into Tensors
def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Create an input function for an Estimator.

    Reads ``data_file`` line by line, decodes each line as CSV, and returns
    the tensors for one batch taken from a one-shot iterator.

    Args:
        data_file: Path of the CSV file to read.
        num_epochs: Value handed to ``Dataset.repeat``.
        shuffle: When truthy, the parsed records are shuffled before
            repeating and batching.
        batch_size: Value handed to ``Dataset.batch``.

    Returns:
        A ``(batch_features, batch_labels)`` pair. ``batch_features`` is a
        dict keyed by the names in ``_CSV_COLUMNS`` except
        ``'income_bracket'``; ``batch_labels`` is the result of comparing
        the ``'income_bracket'`` column with ``'>50K'``.

    Raises:
        AssertionError: If ``tf.gfile.Exists(data_file)`` is false.
    """
    assert tf.gfile.Exists(data_file), "{0} not found.".format(data_file)

    def parse_csv(line):
        # NOTE(review): this print likely fires when the function is traced
        # by Dataset.map rather than once per CSV line -- confirm.
        print("Parsing", data_file)
        # tf.decode_csv turns a CSV line into a list of Tensors, one per
        # column; record_defaults says what fills a missing value in each
        # column.
        decoded = tf.decode_csv(line, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = {name: tensor for name, tensor in zip(_CSV_COLUMNS, decoded)}
        income = features.pop('income_bracket')
        # tf.equal(x, y) returns a bool Tensor holding x == y element-wise.
        over_50k = tf.equal(income, '>50K')
        return features, over_50k

    raw_lines = tf.data.TextLineDataset(data_file)
    dataset = raw_lines.map(parse_csv, num_parallel_calls=5)

    if shuffle:
        shuffle_buffer = _NUM_EXAMPLES['train'] + _NUM_EXAMPLES['validation']
        dataset = dataset.shuffle(buffer_size=shuffle_buffer)

    batched = dataset.repeat(num_epochs).batch(batch_size)

    one_shot = batched.make_one_shot_iterator()
    batch_features, batch_labels = one_shot.get_next()
    return batch_features, batch_labels
# 3. Select and Engineer Features for Model
## 3.1 Base Categorical Feature Columns
# Use a vocabulary list when every possible value is known and there are
# not too many of them.
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key='relationship',
    vocabulary_list=[
        'Husband',
        'Not-in-family',
        'Wife',
        'Own-child',
        'Unmarried',
        'Other-relative',
    ],
)

# Use a hash bucket when the number of distinct values is not known.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    key='occupation',
    hash_bucket_size=1000,
)

education = tf.feature_column.categorical_column_with_vocabulary_list(
    key='education',
    vocabulary_list=[
        'Bachelors',
        'HS-grad',
        '11th',
        'Masters',
        '9th',
        'Some-college',
        'Assoc-acdm',
        'Assoc-voc',
        '7th-8th',
        'Doctorate',
        'Prof-school',
        '5th-6th',
        '10th',
        '1st-4th',
        'Preschool',
        '12th',
    ],
)

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    key='marital_status',
    vocabulary_list=[
        'Married-civ-spouse',
        'Divorced',
        'Married-spouse-absent',
        'Never-married',
        'Separated',
        'Married-AF-spouse',
        'Widowed',
    ],
)

workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    key='workclass',
    vocabulary_list=[
        'Self-emp-not-inc',
        'Private',
        'State-gov',
        'Federal-gov',
        'Local-gov',
        '?',
        'Self-emp-inc',
        'Without-pay',
        'Never-worked',
    ],
)

# 3.2 Base Continuous Feature Columns
age = tf.feature_column.numeric_column(key='age')
education_num = tf.feature_column.numeric_column(key='education_num')
capital_gain = tf.feature_column.numeric_column(key='capital_gain')
capital_loss = tf.feature_column.numeric_column(key='capital_loss')
hours_per_week = tf.feature_column.numeric_column(key='hours_per_week')

# 3.2.1 Discretizing a continuous feature
# Sometimes the relationship between a continuous feature and the label is
# not linear.  As a hypothetical example, a person's income may grow with age
# early in a career, then the growth may slow, and finally the income
# decreases after retirement.  Taken as a whole, such a piecewise curve is
# not a linear relationship, so using the raw age as a real-valued column
# might not be a good choice: a linear model can only learn one of three
# cases -- income always rises with age, always falls with age, or does not
# depend on age at all.
# Bucketization: 10 boundaries produce 11 buckets.
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)

# 3.3 Combined / crossed features
education_x_occupation = tf.feature_column.crossed_column(
    keys=['education', 'occupation'],
    hash_bucket_size=1000,
)
age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
    keys=[age_buckets, 'education', 'occupation'],
    hash_bucket_size=1000,
)
# 4. Model
# The features built so far come in four kinds:
#   1. categorical columns
#   2. numeric columns
#   3. bucketized columns
#   4. crossed columns
# All of them are feature columns, so they can be put together in one list.
base_columns = [
    education, marital_status, relationship, workclass, occupation,
    age_buckets,
]

# Reuse the crossed columns declared in section 3.3 instead of building a
# second, identical pair here, so the two definitions cannot drift apart.
crossed_column = [
    education_x_occupation,
    age_buckets_x_education_x_occupation,
]

model_dir = "./model/wide_component"
model = tf.estimator.LinearClassifier(
    model_dir=model_dir, feature_columns=base_columns + crossed_column
)

train_file = './data/adult.data'
# NOTE(review): val_file is the same path as train_file, so the first
# evaluate() call below scores the model on the data it was trained on.
val_file = './data/adult.data'
test_file = './data/adult.test'

# 5. Train & Evaluate & Predict
model.train(input_fn=lambda: input_fn(data_file=train_file, num_epochs=1, shuffle=True, batch_size=512))

results = model.evaluate(input_fn=lambda: input_fn(val_file, 1, False, 512))
for key in sorted(results):
    print("{0:20}: {1:.4f}".format(key, results[key]))

pred_iter = model.predict(input_fn=lambda: input_fn(test_file, 1, False, 1))
for pred in pred_iter:
    print(pred)
    break  # there are too many predictions; print only the first one

test_results = model.evaluate(input_fn=lambda: input_fn(test_file, 1, False, 512))
for key in sorted(test_results):
    print("{0:20}: {1:.4f}".format(key, test_results[key]))

# 6. Regularization
# Rebinds `model` to a LinearClassifier that uses an FTRL optimizer with L1
# and L2 regularization.  It is only constructed here, not trained.
# NOTE(review): it shares model_dir with the estimator above -- confirm that
# reusing that directory's checkpoints is intended.
model = tf.estimator.LinearClassifier(
    feature_columns=base_columns + crossed_column, model_dir=model_dir,
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0
    )
)
# if __name__ == '__main__':
# print(tf.VERSION)
# data_file = './data/adult.data'
# next_batch = input_fn(data_file, num_epochs=1, shuffle=True, batch_size=5)
# with tf.Session() as sess:
# first_batch = sess.run(next_batch)
# print(first_batch[0])
# print(first_batch[1])
没有合适的资源?快使用搜索试试~ 我知道了~
Google论文"Wide & Deep Learning for Recommender Systems"全套工程文件+数据集...
共57个文件
20160331-6353:12个
meta:8个
index:8个
5星 · 超过95%的资源 需积分: 45 39 下载量 42 浏览量
2018-08-11
16:58:25
上传
评论 3
收藏 2.33MB RAR 举报
温馨提示
Google 于 2016 年发表的论文 "Wide & Deep Learning for Recommender Systems" 的配套代码,包含全套工程文件、数据集和调试过程记录;代码已按升级后的 TensorFlow 版本调试通过,可以正常运行;其他小问题可以参考我写的“调试过程”。
资源推荐
资源详情
资源评论
收起资源包目录
wide_deep.rar (57个子文件)
wide_deep
wide_deep
data_download.py 2KB
wide_deep代码调试记录.html 882B
wide_component.py 6KB
my_test.py 402B
model
wide_deep
model.ckpt-21177.index 2KB
events.out.tfevents.1533953954.WINDOWS-5V3K5HI 4.88MB
eval
events.out.tfevents.1533953976.WINDOWS-5V3K5HI 1.14MB
events.out.tfevents.1533891942.20160331-6353 1.17MB
events.out.tfevents.1533893597.20160331-6353 1.17MB
events.out.tfevents.1533890175.20160331-6353 1.17MB
events.out.tfevents.1533892254.20160331-6353 1.17MB
model.ckpt-24435.data-00000-of-00001 179KB
model.ckpt-21177.meta 624KB
events.out.tfevents.1533890167.20160331-6353 5.16MB
model.ckpt-24435.index 2KB
model.ckpt-21177.data-00000-of-00001 179KB
model.ckpt-22806.index 2KB
events.out.tfevents.1533893588.20160331-6353 5.17MB
model.ckpt-22806.meta 624KB
graph.pbtxt 1.35MB
model.ckpt-22806.data-00000-of-00001 179KB
model.ckpt-22807.data-00000-of-00001 179KB
model.ckpt-21178.data-00000-of-00001 179KB
model.ckpt-22807.index 2KB
checkpoint 277B
model.ckpt-24435.meta 624KB
model.ckpt-21178.index 2KB
events.out.tfevents.1533892246.20160331-6353 5.17MB
events.out.tfevents.1533891934.20160331-6353 5.17MB
model.ckpt-22807.meta 624KB
model.ckpt-21178.meta 624KB
wide_component
model.ckpt-0.index 1KB
eval
events.out.tfevents.1533890712.20160331-6353 1.02MB
events.out.tfevents.1533894224.20160331-6353 1.02MB
model.ckpt-0.data-00001-of-00002 36KB
events.out.tfevents.1533890706.20160331-6353 1.77MB
events.out.tfevents.1533894219.20160331-6353 1.77MB
model.ckpt-0.data-00000-of-00002 8B
model.ckpt-64.data-00000-of-00002 8B
model.ckpt-64.index 1KB
model.ckpt-128.index 1KB
graph.pbtxt 1.02MB
model.ckpt-0.meta 501KB
model.ckpt-128.data-00000-of-00002 8B
model.ckpt-128.meta 501KB
model.ckpt-64.data-00001-of-00002 36KB
checkpoint 172B
model.ckpt-64.meta 501KB
model.ckpt-128.data-00001-of-00002 36KB
wide_deep.py 6KB
.idea
workspace.xml 24KB
wide_deep.iml 459B
misc.xml 213B
modules.xml 270B
inspectionProfiles
profiles_settings.xml 228B
data
adult.data 3.36MB
adult.test 1.68MB
共 57 条
- 1
资源评论
- 静美书斋2020-09-22IDEA中运行可用,感谢。
dawnohdawn
- 粉丝: 1
- 资源: 5
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功