# Final stage of HELP data processing

This notebook collates the final output files ready for writing to csv for ingestion to a VO server. At the bottom of the notebook we also summarise the pipeline products which are processed on a given field. This are generated using the dmu32 meta_main.yml files which contain links to the XID+, CIGALE and photo-z catalogues which feed in to the final catalogues for publishing.

Summary of notebook:

- Take DR1 masterlist suffixes from overview table
- Find dmu32 full table names and write to a file
- Create summary of all the data products per field using the dmu32 meta_main.yml files

In [1]:
from  herschelhelp_internal  import git_version
print("This notebook was run with herschelhelp_internal version: \n{}".format(git_version()))
import datetime
print("This notebook was executed on: \n{}".format(datetime.datetime.now()))

This notebook was run with herschelhelp_internal version: 
017bb1e (Mon Jun 18 14:58:59 2018 +0100) [with local modifications]
This notebook was executed on: 
2020-10-30 13:59:28.967489


In [2]:
from astropy.table import Table, Column
from astropy import units as u
import numpy as np
import glob
from pymoc import MOC
import hashlib
from herschelhelp_internal.masterlist import find_last_ml_suffix

import yaml

import os
import time

import humanfriendly

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
The text.latex.unicode rcparam was deprecated in Matplotlib 2.2 and will be removed in 3.1.
  "2.2", name=key, obj_type="rcparam", addendum=addendum)


In [3]:
TODAY = os.environ.get('SUFFIX', time.strftime("_%Y%m%d"))

## The definition of HELP PDR1
Here we take the DR1 definition from the dmu32 yaml files which are the definition of the final and official files. 

In [4]:
yaml_files = glob.glob('./*/meta_main.yml')

In [5]:
field_yamls = [yaml.load(open(f, 'r')) for f in yaml_files]

In [6]:
field_yamls[0]

{'field': 'SA13',
 'region': 'dmu_products/dmu2/dmu2_field_coverages/SA13_MOC.fits',
 'surveys': ['LegacySurvey', 'UHS'],
 'masterlist': 'dmu_products/dmu1/dmu1_ml_SA13/data/master_catalogue_sa13_20180501.fits',
 'depths': 'dmu_products/dmu1/dmu1_ml_SA13/data/depths_sa13_20180501.fits',
 'flags': 'None',
 'xid': ['dmu_products/dmu26/dmu26_XID+SPIRE_SA13/data/dmu26_XID+SPIRE_SA13_cat_20191024.fits'],
 'photoz': 'dmu_products/dmu24/dmu24_SA13/data/SA13_DESI-DR7_Zou_et_al_2019_photo_z_withhelp_id.fits',
 'cigale': 'dmu_products/dmu28/dmu28_SA13/data/zphot/HELP_final_results.fits',
 'cigale_ldust_prediction': 'dmu_products/dmu28/dmu28_SA13/data/SA13_Ldust_prediction_results.fits',
 'final': 'dmu_products/dmu32/dmu32_SA13/data/SA13_20180501.fits'}

In [7]:
GAVO_FOLDER = '/mnt/hedam/data_vo/'
stilts_command = 'stilts tpipe {in_file} omode=out ofmt=csv out={GAVO_FOLDER}{out_file}'

final_data = open('help_to_vo.sh', 'w+')
for y in field_yamls:
    print(y['field'])
    final_help_product = y['final'].replace('dmu_products', '..')
    cigale_input = y['cigale']

    if os.path.exists(final_help_product):
        print(final_help_product)
    
        #Test with Cigale input files
        final_data.write(stilts_command.format(
            in_file=final_help_product, 
            GAVO_FOLDER=GAVO_FOLDER, 
            out_file='herschelhelp/main/{} \n'.format(
                final_help_product.split('/')[-1].replace('.fits', '.csv')
            )
        ))
        
    else:
        final_data.write('# No data for {} \n'.format(y['field']))
        
    #final_data.write('./dmu32_{}/data/{}_{}.fits'.format(field[0], field[0], field[1]))
    
final_data.close()

SA13
../dmu32/dmu32_SA13/data/SA13_20180501.fits
Bootes
../dmu32/dmu32_Bootes/data/Bootes_20190701.fits
xFLS
../dmu32/dmu32_xFLS/data/xFLS_20180501.fits
GAMA-15
../dmu32/dmu32_GAMA-15/data/GAMA-15_20180213.fits
GAMA-12
../dmu32/dmu32_GAMA-12/data/GAMA-12_20180218.fits
HDF-N
../dmu32/dmu32_HDF-N/data/HDF-N_20180427.fits
HATLAS-NGP
../dmu32/dmu32_NGP/data/NGP_20180219.fits
ELAIS-N2
../dmu32/dmu32_ELAIS-N2/data/ELAIS-N2_20180218.fits
EGS
../dmu32/dmu32_EGS/data/EGS_20180501.fits
COSMOS
../dmu32/dmu32_COSMOS/data/COSMOS_20190402.fits
HATLAS-SGP
../dmu32/dmu32_SGP/data/SGP_20180221.fits
Lockman-SWIRE
../dmu32/dmu32_Lockman-SWIRE/data/Lockman-SWIRE_20180219.fits
XMM-LSS
../dmu32/dmu32_XMM-LSS/data/XMM-LSS_20190328.fits
ELAIS-N1
../dmu32/dmu32_ELAIS-N1/data/ELAIS-N1_20171016.fits
AKARI-SEP
../dmu32/dmu32_AKARI-SEP/data/AKARI-SEP_20180221.fits
GAMA-09
../dmu32/dmu32_GAMA-09/data/GAMA-09_20180601.fits
AKARI-NEP
../dmu32/dmu32_AKARI-NEP/data/AKARI-NEP_20180215.fits
ELAIS-S1
../dmu32/dmu32_ELAIS-

The out put of this notebook is a shell script which will write all the fits files to csv files in the vo folder

In [8]:
depths_to_vo = open('depths_to_vo.sh', 'w+')
for y in field_yamls:
    final_depth_product = y['depths'].replace('dmu_products', '..')
    

    if os.path.exists(final_depth_product):
        print(final_depth_product)
    
        #Test with Cigale input files
        depths_to_vo.write(stilts_command.format(
            in_file=final_depth_product, 
            GAVO_FOLDER=GAVO_FOLDER, 
            out_file='depth/{} \n'.format(final_depth_product.split('/')[-1].replace('.fits', '.csv'))
        ))

        
    else:
        depths_to_vo.write('# No depths for {} \n'.format(y['field']))
        
    #final_data.write('./dmu32_{}/data/{}_{}.fits'.format(field[0], field[0], field[1]))
    
depths_to_vo.close()

../dmu1/dmu1_ml_SA13/data/depths_sa13_20180501.fits
../dmu1/dmu1_ml_Bootes/data/depths_bootes_20190201.fits
../dmu1/dmu1_ml_xFLS/data/depths_xfls_20180501.fits
../dmu1/dmu1_ml_GAMA-15/data/depths_gama-15_20180213.fits
../dmu1/dmu1_ml_GAMA-12/data/depths_gama-12_20180218.fits
../dmu1/dmu1_ml_HDF-N/data/depths_hdf-n_20180427.fits
../dmu1/dmu1_ml_NGP/data/depths_ngp_20180219.fits
../dmu1/dmu1_ml_ELAIS-N2/data/depths_elais-n2_20180218.fits
../dmu1/dmu1_ml_EGS/data/depths_egs_20180501.fits
../dmu1/dmu1_ml_SGP/data/depths_sgp_20180221.fits
../dmu1/dmu1_ml_Lockman-SWIRE/data/depths_lockman-swire_20180219.fits
../dmu1/dmu1_ml_ELAIS-N1/data/depths_elais-n1_20171016.fits
../dmu1/dmu1_ml_AKARI-SEP/data/depths_akari-sep_20180221.fits
../dmu1/dmu1_ml_GAMA-09/data/depths_gama-09_20180601.fits
../dmu1/dmu1_ml_AKARI-NEP/data/depths_akari-nep_20180215.fits
../dmu1/dmu1_ml_ELAIS-S1/data/depths_elais-s1_20180416.fits
../dmu1/dmu1_ml_XMM-13hr/data/depths_xmm-13hr_20180501.fits
../dmu1/dmu1_ml_Herschel-Str

## Summarise completeness of HELP data sets

Here we get information about what is available on each field to summarise the data products available per field. We take the cigale, xid+ and photo-z filenames from the per field meta_main.yml files here and check they are there and how large they are. This then given a summary of all the data present.

In [9]:
dr1 = Table()
dr1['field'] = [y['field'] for y in field_yamls]
dr1.sort('field')

In [10]:
fields_info = yaml.load(open("../dmu2/meta_main.yml", 'r'))

In [11]:
dr1['objects']             =np.full(len(dr1), 0, dtype=int)
dr1['dr1_file']            =np.full(len(dr1), 0, dtype=np.dtype('U250'))
dr1['dr1_file_hash']       =np.full(len(dr1), 0, dtype=np.dtype('U250'))
dr1['area_sq_degrees']     =np.full(len(dr1), 0, dtype='float64')
dr1['file_size_bytes']     =np.full(len(dr1), 0, dtype=int)
dr1['file_size_readable']  =np.full(len(dr1), 0, dtype=np.dtype('U250'))
dr1['xid_objects']         =np.full(len(dr1), 0, dtype=int)
dr1['photoz_objects']      =np.full(len(dr1), 0, dtype=int)
dr1['cigale_objects']      =np.full(len(dr1), 0, dtype=int)
dr1['blind_objects']      =np.full(len(dr1), 0, dtype=int)



In [12]:

def file_as_bytes(file):
    with file:
        return file.read()



In [13]:
for y in field_yamls:
    print(y['field'] + ':')
    this_row = dr1['field'] == y['field']
    final = y['final'].replace('dmu_products/', '../')
    moc = y['region'].replace('dmu_products/', '../')
    try:
        cat = Table.read(final)
        
        
        dr1['objects'][this_row]            = len(cat)
        dr1['dr1_file'][this_row]           = y['final']
        dr1['dr1_file_hash'][this_row]      = hashlib.md5(file_as_bytes(open(final, 'rb'))).hexdigest()
        dr1['area_sq_degrees'][this_row]    = help_moc = MOC(filename=moc).area_sq_deg
        size = os.stat(final).st_size
        dr1['file_size_bytes'][this_row]    = size

        dr1['file_size_readable'][this_row] = humanfriendly.format_size(size)
        dr1['xid_objects'][this_row]        = np.sum(cat['f_spire_500']>0)
        dr1['photoz_objects'][this_row]     =  np.sum(cat['redshift']>0)
        dr1['cigale_objects'][this_row]     = np.sum(cat['cigale_sfr']>0)
        
        blind = Table.read('../dmu22/dmu22_{}/data/dmu22_XID+SPIRE_{}_BLIND_Matched_MF.fits'.format(
            y['field'],y['field']))
        dr1['blind_objects'][this_row]     = len(blind)
    
    except:
        print('Problem reading {}'.format(y['final']))

SA13:


  return getattr(self.data, op)(other)


Bootes:
xFLS:
GAMA-15:
GAMA-12:
HDF-N:
Problem reading dmu_products/dmu32/dmu32_HDF-N/data/HDF-N_20180427.fits
HATLAS-NGP:
ELAIS-N2:
EGS:
COSMOS:
HATLAS-SGP:
Lockman-SWIRE:
XMM-LSS:
ELAIS-N1:
AKARI-SEP:
GAMA-09:
AKARI-NEP:
ELAIS-S1:
XMM-13hr:
Herschel-Stripe-82:
CDFS-SWIRE:
SPIRE-NEP:
SSDF:


In [14]:
final

'../dmu32/dmu32_SSDF/data/SSDF_20180221.fits'

In [15]:
dr1.show_in_notebook()

idx,field,objects,dr1_file,dr1_file_hash,area_sq_degrees,file_size_bytes,file_size_readable,xid_objects,photoz_objects,cigale_objects
0,AKARI-NEP,531746,dmu_products/dmu32/dmu32_AKARI-NEP/data/AKARI-NEP_20180215.fits,5b33074e816785e05407952f03e78354,9.194732358779468,433440000,433.44 MB,31441,107228,1239
1,AKARI-SEP,844172,dmu_products/dmu32/dmu32_AKARI-SEP/data/AKARI-SEP_20180221.fits,56889960f654742f80329e8ebb0053d5,8.713306475131118,668652480,668.65 MB,108119,139059,566
2,Bootes,3398098,dmu_products/dmu32/dmu32_Bootes/data/Bootes_20190701.fits,6d2f18ef25a42999d805b258c751fde2,11.42815299095486,5556003840,5.56 GB,495159,1570512,38980
3,CDFS-SWIRE,2171051,dmu_products/dmu32/dmu32_CDFS-SWIRE/data/CDFS-SWIRE_20180613.fits,7bb01da6145d35ffa157b7c3ee0209ce,12.971246403717068,6059603520,6.06 GB,73511,136944,9308
4,COSMOS,2599374,dmu_products/dmu32/dmu32_COSMOS/data/COSMOS_20190402.fits,c900763f01cd327b84fafa3defe46151,5.083863478496816,10954031040,10.95 GB,25898,691502,15747
5,EGS,1412613,dmu_products/dmu32/dmu32_EGS/data/EGS_20180501.fits,74ce7e6bd7a982141d7558fad62b38df,3.566383275122158,4018965120,4.02 GB,223598,1182503,4159
6,ELAIS-N1,4026292,dmu_products/dmu32/dmu32_ELAIS-N1/data/ELAIS-N1_20171016.fits,97ccb7d86c92aa9e9ba0657c9e737276,13.507484555454765,6007360320,6.01 GB,269611,2714686,49985
7,ELAIS-N2,1783240,dmu_products/dmu32/dmu32_ELAIS-N2/data/ELAIS-N2_20180218.fits,09628a8b89fd5eeea27d6059988eb900,9.167479903991111,2321884800,2.32 GB,86591,120723,6798
8,ELAIS-S1,1655564,dmu_products/dmu32/dmu32_ELAIS-S1/data/ELAIS-S1_20180416.fits,e1e31bd8ae8b7646c47aaec6a40e2cf4,9.002940646885508,2231798400,2.23 GB,194276,1013582,25393
9,GAMA-09,12937982,dmu_products/dmu32/dmu32_GAMA-09/data/GAMA-09_20180601.fits,6712784e0dd54abaca5dc46b82a2c7a5,62.01393417284915,21839469120,21.84 GB,1386659,8833874,130293


In [16]:
print("""Totals:
Area:              {} Square degrees
Objects :          {}
XID+ objects :     {} (= {} percent)
Redshifts :        {} (= {} percent)
CIGALE objects :   {} (= {} percent)
Final tables size: {}
""".format(
    round(np.sum(dr1['area_sq_degrees'])),
    np.sum(dr1['objects']),
    np.sum(dr1['xid_objects']), round(100*np.sum(dr1['xid_objects'])/np.sum(dr1['objects'])),
    np.sum(dr1['photoz_objects']), round(100*np.sum(dr1['photoz_objects'])/np.sum(dr1['objects'])),
    np.sum(dr1['cigale_objects']), round(100*np.sum(dr1['cigale_objects'])/np.sum(dr1['objects'])),
    humanfriendly.format_size(np.sum(dr1['file_size_bytes']))
))

Totals:
Area:              1269.0 Square degrees
Objects :          171604418
XID+ objects :     18010387 (= 10.0 percent)
Redshifts :        93771513 (= 55.0 percent)
CIGALE objects :   1717895 (= 1.0 percent)
Final tables size: 328.64 GB



In [17]:
dr1.write('dr1_data_products_overview{}.csv'.format(TODAY), overwrite=True)