Error while using tensorflow transform' writeTransform function - apache-beam

I'm currently using Tensorflow transform library to convert and save the transformation, though it used to work before just fine currently im facing a bit of issue something similar to below
I keep getting the same error like -
'BeamDatasetMetadata' object has no attribute 'schema' [while running
'AnalyzeAndTransformDataset/TransformDataset/ConvertAndUnbatch']
Is someone familiar with the error above and how can we resolve it?
My Transform Function looks like below -
# ### Transformation Function
def transform_data(train_data_file, test_data_file, working_dir):
"""Transform the data and write out as a TFRecord of Example protos.
Read in the data using the CSV reader, and transform it using a
preprocessing pipeline that scales numeric data and converts categorical data
from strings to int64 values indices, by creating a vocabulary for each
category.
Args:
train_data_file: File containing training data
test_data_file: File containing test data
working_dir: Directory to write transformed data and metadata to
"""
def preprocessing_fn(inputs):
"""Preprocess input columns into transformed columns."""
outputs = {}
# Scale numeric columns to have range [0, 1].
for key in NUMERIC_FEATURE_KEYS:
outputs[key] = tft.scale_to_0_1(inputs[key])
# For all categorical columns except the label column, we use
# tft.string_to_int which computes the set of unique values and uses this
# to convert the strings to indices.
for key in CATEGORICAL_FEATURE_KEYS:
tft.uniques(inputs[key], vocab_filename=key)
""" We would use the lookup table when the label is a string value
In our case here Creative_id = 0/1 so we can direclty assign output as is
"""
outputs[LABEL_KEY] = inputs[LABEL_KEY]
return outputs
# The "with" block will create a pipeline, and run that pipeline at the exit
# of the block.
with beam.Pipeline() as pipeline:
with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
# Create a coder to read the data with the schema. To do this we
# need to list all columns in order since the schema doesn't specify the
# order of columns in the csv.
ordered_columns = [
'app_category', 'connection_type', 'creative_id', 'day_of_week',
'device_size', 'geo', 'hour_of_day', 'num_of_connects',
'num_of_conversions', 'opt_bid', 'os_version'
]
converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)
# Read in raw data and convert using CSV converter. Note that we apply
# some Beam transformations here, which will not be encoded in the TF
# graph since we don't do the from within tf.Transform's methods
# (AnalyzeDataset, TransformDataset etc.). These transformations are just
# to get data into a format that the CSV converter can read, in particular
# removing empty lines and removing spaces after commas.
raw_data = (
pipeline
| 'ReadTrainData' >> textio.ReadFromText(train_data_file)
| 'FilterTrainData' >> beam.Filter(
lambda line: line and line != 'app_category,connection_type,creative_id,day_of_week,device_size,geo,hour_of_day,num_of_connects,num_of_conversions,opt_bid,os_version')
| 'FixCommasTrainData' >> beam.Map(
lambda line: line.replace(', ', ','))
| 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))
# Combine data and schema into a dataset tuple. Note that we already used
# the schema to read the CSV data, but we also need it to interpret
# raw_data.
raw_dataset = (raw_data, RAW_DATA_METADATA)
transformed_dataset, transform_fn = (
raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
transformed_data, transformed_metadata = transformed_dataset
transformed_data_coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
_ = (
transformed_data
| 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
| 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
# Now apply transform function to test data. In this case we also remove
# the header line from the CSV file and the trailing period at the end of
# each line.
raw_test_data = (
pipeline
| 'ReadTestData' >> textio.ReadFromText(test_data_file, skip_header_lines=1)
| 'FixCommasTestData' >> beam.Map(
lambda line: line.replace(', ', ','))
| 'DecodeTestData' >> beam.Map(converter.decode))
raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)
transformed_test_dataset = ((raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
# Don't need transformed data schema, it's the same as before.
transformed_test_data, _ = transformed_test_dataset
_ = (
transformed_test_data
| 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
| 'WriteTestData' >> tfrecordio.WriteToTFRecord(
os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
_ = (
transform_fn
| 'WriteTransformFn' >>
transform_fn_io.WriteTransformFn(working_dir))
Ouput stack for -
pip show tensorflow-transform apache-beam
Name: tensorflow-transform
Version: 0.4.0
Summary: A library for data preprocessing with TensorFlow
Home-page: UNKNOWN
Author: Google Inc.
Author-email: tf-transform-feedback#google.com
License: Apache 2.0
Location: /usr/local/lib/python2.7/dist-packages
Requires: six, apache-beam, protobuf
---
Name: apache-beam
Version: 2.4.0
Summary: Apache Beam SDK for Python
Home-page: https://beam.apache.org
Author: Apache Software Foundation
Author-email: dev#beam.apache.org
License: Apache License, Version 2.0
Location: /usr/local/lib/python2.7/dist-packages
Requires: oauth2client, httplib2, mock, crcmod, grpcio, futures, pyvcf, avro, typing, pyyaml, dill, six, hdfs, protobuf
You are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
the above issue doesn't seem to occur all the time though! looks like it has some conflict with other packages.

Can't see but this line looks incomplete:
```
converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)
```
A possible way to do it:
```
INPUT_SCHEMA = dataset_schema.from_feature_spec({
'label':tf.FixedLenFeature(shape=[], dtype=tf.float32),
'id': tf.FixedLenFeature(shape=[], dtype=tf.float32),
'date': tf.FixedLenFeature(shape=[], dtype=tf.string),
'random': tf.FixedLenFeature(shape=[], dtype=tf.string),
'name': tf.FixedLenFeature(shape=[], dtype=tf.string),
'tweet': tf.FixedLenFeature(shape=[], dtype=tf.string),
})
```
```
converter_input = coders.CsvCoder(
['label','id','date','random','name','tweet'],
INPUT_SCHEMA,
delimiter=delimiter)
```
Then for the transform step where it seams like your actuall problem is here is an example as well.
```
input_metadata =
dataset_metadata.DatasetMetadata(schema=TRANSFORM_INPUT_SCHEMA)
TRANSFORM_INPUT_SCHEMA = dataset_schema.from_feature_spec({
'id': tf.FixedLenFeature(shape=[], dtype=tf.float32),
'label': tf.FixedLenFeature(shape=[], dtype=tf.float32),
'tweet': tf.FixedLenFeature(shape=[], dtype=tf.string),
'answer_to_nbr': tf.FixedLenFeature(shape=[], dtype=tf.float32),
'nbr_of_tags': tf.FixedLenFeature(shape=[], dtype=tf.float32),
})
train_dataset = (train_dataset, input_metadata)
transformed_dataset, transform_fn = (train_dataset
| 'AnalyzeAndTransform' >>
beam_impl.AnalyzeAndTransformDataset(
preprocessing_fn))
```
Hope it helps you :) If you post to your github repo I could look at the full code and see if I can help! Good luck!
Look at this repo for help https://github.com/Fematich/tftransform-demo

Related

Formating issue with dump in 0.17.16-1

I have a YAML file that is usually edited by a human but, recently I have a need for it to also be edited by an automated task. I am using version 0.17.16-1 from the Ubuntu repo. I have mostly figured out how to get the output to look like the input with one exception. When there is a comment in my YAML right before an array, the first element is mis-formatted. If I remove the comment, the formatting is correct. It also doesn't matter if the comment is left-justified or indent like in the example. Most likely, I have something mis-configured so, if anyone could point it out to me I'd be very grateful. This has been driving me nuts for a few days now.
import sys
import ruamel.yaml
yaml_str = """\
top_level:
# comment
-
key1: "1"
key4: "4"
-
key2: "2"
key5: "5"
"""
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.compact(seq_map=False)
data = yaml.load(yaml_str)
yaml.dump(data, sys.stdout)
Output:
top_level:
# comment
-
key1: '1' key4: '4'
-
key2: '2'
key5: '5'
ruamel.yaml attaches comments normally to the node preceding the comment,
so you are not having a comment before a sequence (no arrays in YAML), but a comment
between a key and its value. Those can be problematic, but the main problem here
seems to be the use of yaml.compact() in combination with the comment:
import sys
import ruamel.yaml
yaml_str = """\
top_level:
# comment
-
key1: "1"
key4: "4"
-
key2: "2"
key5: "5"
"""
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes = True # this way you keep double quotes from the input
# yaml.compact(seq_map = False)
data = yaml.load(yaml_str)
yaml.dump(data, sys.stdout)
which gives:
top_level:
# comment
- key1: "1"
key4: "4"
- key2: "2"
key5: "5"
If the above really does have to have block sequence indicator on a line of its own, you can
trivially postprocess the output with the transform parameter of dump().
You should be using a Python virtual environment, and never work in the system
Python space. That allows you to install newer versions of all packages than the
system uses.

Pyspark - How to calculate file hashes

I have a bunch of CSV files in a mounted blob container and I need to calculate the 'SHA1' hash values for every file to store as inventory. I'm very new to Azure cloud and pyspark so I'm not sure how this can be achieved efficiently. I have written the following code in Python Pandas and I'm trying to use this in pyspark. It seems to work however it takes quite a while to run as there are thousands of CSV files. I understand that things work differently in pyspark, so can someone please guide if my approach is correct, or if there is a better piece of code I can use to accomplish this task?
import os
import subprocess
import hashlib
import pandas as pd
class File:
def __init__(self, path):
self.path = path
def get_hash(self):
hash = hashlib.sha1()
with open(self.path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash.update(chunk)
self.md5hash = hash.hexdigest()
return self.md5hash
path = '/dbfs/mnt/data/My_Folder' #Path to CSV files
cnt = 0
rlist = []
for path, subdirs, files in os.walk(path):
for fi in files:
if cnt < 10: #check on only 10 files for now as it takes ages!
f = File(os.path.join(path, fi))
cnt +=1
hash_value = f.get_hash()
results = {'File_Name': fi, 'File_Path': f.filename, 'SHA1_Hash_Value': hash_value}
rlist.append(results)
print(fi)
df = pd.DataFrame(rlist)
print(str(cnt) + ' files processed')
df = pd.DataFrame(rlist)
#df.to_csv('/dbfs/mnt/workspace/Inventory/File_Hashes.csv', mode='a', header=False) #not sure how to write files in pyspark!
display(df)
Thanks
Since you want to treat the files as blobs and not read them into a table. I would recommend using spark.sparkContext.binaryFiles this would land you an RDD of pairs where the key is the file name and the value is a file-like object, on which you can calculate the hash in a map function (rdd.mapValues(calculate_hash_of_file_like))
For more information, refer to the documentation: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.SparkContext.binaryFiles.html#pyspark.SparkContext.binaryFiles

OCTAVE data import from PCE-VDL data logger device and conversion of decimal coma to decimal point

I have a measurement device PCE-VDL, which gives me measurements in following CSV format below, which I need to import to OCTAVE for further investigation.
Especially I need to import last 3 columns with xyz acceleration data.
The file is in CSV format with delimiter of semicolon ";".
I have tried:
A_1 = importdata ("file.csv", ";", 3);
but have recieved
error: missing_idx(10): out of bound 9
The CSV file looks like this:
#PCE-VDL X - TableView series
#2020.16.11
#Date;Time;Duration [s];t [°C];RH [%];p [mbar];aX [g];aY [g];aZ [g];
2020.28.10;16:16:32:0000;00:000;;;;0,0195;-0,0547;1,0039;
2020.28.10;16:16:32:0052;00:005;;;;0,0898;-0,0273;0,8789;
2020.28.10;16:16:32:0104;00:010;;;;0,0977;-0,0313;0,9336;
2020.28.10;16:16:32:0157;00:015;;;;0,1016;-0,0273;0,9297;
The numbers in last 3 columns have also decimal coma and not decimal point. So there probably should be done also some conversion.
Thank you very much for any help.
Regards
EDIT: 18.11.2020
Thanks for help. I have tried now following:
A_1_str = fileread ("file.csv");
A_1_str_m = strrep (A_1_str, ".", "-");
A_1_str_m = strrep (A_1_str_m, ",", ".");
save "A_1_str_m.csv" A_1_str_m;
A_1 = importdata ("A_1_str_m.csv", ";", 8);
and still receive error: file_content(140): out of bound 139
There is probably some problem with time format in first columns, which I do not want to read. I just need last three columns.
After my conversion, the file looks like this:
# Created by Octave 5.1.0, Wed Nov 18 21:40:52 2020 CET <zdenek#ASUS-F5V>
# name: A_1_str_m
# type: sq_string
# elements: 1
# length: 7849
#PCE-VDL X - TableView series
#2020-16-11
#Date;Time;Duration [s];t [°C];RH [%];p [mbar];aX [g];aY [g];aZ [g];
2020-28-10;16:16:32:0000;00:000;;;;0.0195;-0.0547;1.0039;
2020-28-10;16:16:32:0052;00:005;;;;0.0898;-0.0273;0.8789;
2020-28-10;16:16:32:0104;00:010;;;;0.0977;-0.0313;0.9336;
Thanks for support!
You can first read the data with fileread, which stores the data as a string. Then you can manipulate the string like this:
new_string = strrep(string, ",", ".");
strrep replaces all occurrences of a pattern within a string. Afterwards you save this data as a separate file or you overwrite the existing file with the manipulated data. When this is done you proceed as you have tried before.
EDIT: 19.11.2020
To avoid the additional heading lines in the new file, you can save it like this:
fid = fopen("A_1_str_m.csv", "w");
fputs(fid, A_1_str_m);
fclose(fid);
fputs will just write the string to the file.
The you can read the new file with dlmread.
A1_buf = dlmread("A_1_str_m.csv", ";");
A1_buf = real(A1); # get the real value of the complex number
A1_buf(1:3, :) = []; # remove the headlines
A1 = A1_buf(:, end-3:end-1); # get only the the 3 columns you're looking for
This will give you the three columns your looking for. But the date and time data will be ignored.
EDIT 20.11.2020
Replaced abs with real, so the sign of the value will be kept.
Use csv2cell from the io package.

how to explicitly write two references in ruamel.yaml

If I have multiple references and when I write them to a YAML file using ruaml.yaml from Python I get:
<<: [*name-name, *help-name]
but instead I would prefer to have
<<: *name-name
<<: *help-name
Is there an option to achieve this while writing to the file?
UPDATE
descriptions:
- &description-one-ref
description: >
helptexts:
- &help-one
help_text: |
questions:
- &question-one
title: "title test"
reference: "question-one-ref"
field: "ChoiceField"
choices:
- "Yes"
- "No"
required: true
<<: *description-one-ref
<<: *help-one
riskvalue_max: 10
calculations:
- conditions:
- comparator: "equal"
value: "Yes"
actions:
- riskvalue: 0
- conditions:
- comparator: "equal"
value: "No"
actions:
- riskvalue: 10
Currently I'm reading such a file and modify specific values within python and then want to write it back. When I'm writing I'm getting the issue that the references are as list and not as outlined.
That means the workflow is as: I'm reading the doc via
yaml = ruamel.yaml.YAML()
with open('test.yaml') as f:
data = yaml.load(f)
for k in data.keys():
if k == 'questions':
q = data.get(k)
for i in range(0, len(q)):
q[i]['title'] = "my new title"
f.close()
g = open('new_file.yaml', 'w')
yaml(data)
g.close()
No, there is no such option, as it would lead to an invalid YAML file.
The << is a mapping key, for which the value is interpreted
specially assuming the parser implements to the language independent
merge key specification. And a mapping key must be unique
according to the YAML specification:
The content of a mapping node is an unordered set of key: value node
pairs, with the restriction that each of the keys is unique.
That ruamel.yaml (< 0.15.75) doesn't throw an error on such
duplicate key is a bug. On duplicate normal keys, ruamel.yaml
does throw an error. The bug is inherited from PyYAML (which is not
specification conformant, and does not throw an error even on
duplicate normal keys).
However with a little pre- and post-processing what you want to do can
be easily achieved. The trick is to make the YAML valid before parsing
by making the offending duplicate << keys unique (but recognisable)
and then, when writing the YAML back to file, substituting these
unique keys by <<: * again. In the following the first occurence of
<<: * is replaced by [<<, 0]:, the second by [<<, 1]: etc.
The * needs to be part of the substitution, as there are no anchors in
the document for those aliases.
import sys
import subprocess
import ruamel.yaml
yaml = ruamel.yaml.YAML()
yaml.preserve_quotes = True
yaml.indent(sequence=4, offset=2)
class DoubleMergeKeyEnabler(object):
def __init__(self):
self.pat = '<<: ' # could be at the root level mapping, so no leading space
self.r_pat = '[<<, {}]: ' # probably not using sequences as keys
self.pat_nr = -1
def convert(self, doc):
while self.pat in doc:
self.pat_nr += 1
doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
return doc
def revert(self, doc):
while self.pat_nr >= 0:
doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
self.pat_nr -= 1
return doc
dmke = DoubleMergeKeyEnabler()
with open('test.yaml') as fp:
# we don't do this line by line, that would not work well on flow style mappings
orgdoc = fp.read()
doc = dmke.convert(orgdoc)
data = yaml.load(doc)
data['questions'][0].anchor.always_dump = True
#######################################
# >>>> do your thing on data here <<< #
#######################################
with open('output.yaml', 'w') as fp:
yaml.dump(data, fp, transform=dmke.revert)
res = subprocess.check_output(['diff', '-u', 'test.yaml', 'output.yaml']).decode('utf-8')
print('diff says:', res)
which gives:
diff says:
which means the files are the same on round-trip (as long as you don't
change anything before dumping).
Setting preserve_quotes and calling ident() on the YAML instance are necessary to
preserve your superfluous quotes, resp. keeping the indentation.
Since the anchor question-one has no alias, you need to enable dumping explicitly by
setting always_dump on that attribute to True. If necessary you can recursively
walk over data and set anchor.always_dump = True when .anchor.value is not None

How to write to a file name defined at runtime?

I want to write to a gs file but I don’t know the file name at compile time. Its name is based on behavior that is defined at runtime. How can I proceed?
If you're using Beam Java, you can use FileIO.writeDynamic() for this (starting with Beam 2.3 which is currently in the process of being released - but you can already use it via the version 2.3.0-SNAPSHOT), or the older DynamicDestinations API (available in Beam 2.2).
Example of using FileIO.writeDynamic() to write a PCollection of bank transactions to different paths on GCS depending on the transaction's type:
PCollection<BankTransaction> transactions = ...;
transactions.apply(
FileIO.<BankTransaction, TransactionType>writeDynamic()
.by(Transaction::getType)
.via(BankTransaction::toString, TextIO.sink())
.to("gs://bucket/myfolder/")
.withNaming(type -> defaultNaming("transactions_", ".txt"));
For an example of DynamicDestinations use, see example code in the TextIO unit tests.
Alternatively, if you want to write each record to its own file, just use the FileSystems API (in particular, FileSystems.create()) from a DoFn.
For the Python crowd:
An experimental write was added to the Beam python SDK in 2.14.0, beam.io.fileio.WriteToFiles:
my_pcollection | beam.io.fileio.WriteToFiles(
path='/my/file/path',
destination=lambda record: 'avro' if record['type'] == 'A' else 'csv',
sink=lambda dest: AvroSink() if dest == 'avro' else CsvSink(),
file_naming=beam.io.fileio.destination_prefix_naming())
which can be used to write to different files per-record.
If your filename is based on data within your pcollections, you can use the destination and file_naming to create files based on each record's data.
More documentation here:
https://beam.apache.org/releases/pydoc/2.14.0/apache_beam.io.fileio.html#dynamic-destinations
And the JIRA issue here:
https://issues.apache.org/jira/browse/BEAM-2857
As #anrope mentioned already, apache_beam.io.fileio seems to be the latest Python API for writing files. The WordCount example is currently outdated since it uses the WriteToText class, which inherits from the now deprecated apache_beam.io.filebasedsink / apache_beam.io.iobase
To add to existing answers, here is my pipeline in which I dynamically name the output files during runtime. My pipeline takes N input files and creates N output files, which are named based on their corresponding input file name.
with beam.Pipeline(options=pipeline_options) as p:
(p
| 'CreateFiles' >> beam.Create(input_file_paths)
| 'MatchFiles' >> MatchAll()
| 'OpenFiles' >> ReadMatches()
| 'LoadData' >> beam.Map(custom_data_loader)
| 'Transform' >> beam.Map(custom_data_transform)
| 'Write' >> custom_writer
)
When I load the data I create a PCollection of tuple records (file_name, data). All my transforms are applied to data, but I pass file_name through to the end of the pipeline to generate the output file names.
def custom_data_loader(f: beam.io.fileio.ReadableFile):
file_name = f.metadata.path.split('/')[-1]
data = custom_read_function(f.open())
return file_name, data
def custom_data_transform(record):
file_name, data = record
data = custom_transform_function(data) # not defined
return file_name, data
And I save the file with:
def file_naming(record):
file_name, data = record
file_name = custom_naming_function(file_name) # not defined
return file_name
def return_destination(*args):
"""Optional: Return only the last arg (destination) to avoid sharding name format"""
return args[-1]
custom_writer = WriteToFiles(
path='path/to/output',
file_naming=return_destination,
destination=file_naming,
sink=TextSink()
)
Replace all of the custom_* functions with your own logic.
I know this is a bit of an old question but I struggled with the examples in the documentation.
Here is a simple example of how to split files based on dict items.
pipeline_options = PipelineOptions()
pipeline_options.view_as(SetupOptions).save_main_session = False
def file_names(*args):
file_name = fileio.destination_prefix_naming()(*args)
destination, *_ = file_name.split("----")
return f"{destination}.json"
class JsonSink(fileio.TextSink):
def write(self, element):
record = json.loads(element)
record.pop("id")
self._fh.write(json.dumps(record).encode("utf8"))
self._fh.write("\n".encode("utf8"))
def destination(element):
return json.loads(element)["id"]
with beam.Pipeline(options=pipeline_options) as p:
data = [
{"id": 0, "message": "whhhhhhyyyyyyy"},
{"id": 1, "message": "world"},
{"id": 1, "message": "hi there!"},
{"id": 1, "message": "what's up!!!?!?!!?"},
]
(
p
| "CreateEmails" >> beam.Create(data)
| "JSONify" >> beam.Map(json.dumps)
| "Write Files"
>> fileio.WriteToFiles(
path="path/",
destination=destination,
sink=lambda dest: JsonSink(),
file_naming=file_names,
)
)